Index: clang/docs/ClangCommandLineReference.rst =================================================================== --- clang/docs/ClangCommandLineReference.rst +++ clang/docs/ClangCommandLineReference.rst @@ -2982,6 +2982,10 @@ Specify SRAM ECC mode (AMDGPU only) +.. option:: -mtgsplit, -mno-tgsplit + +Enable threadgroup split execution mode (AMDGPU only) + .. option:: -mxnack, -mno-xnack Specify XNACK mode (AMDGPU only) Index: clang/include/clang/Basic/BuiltinsAMDGPU.def =================================================================== --- clang/include/clang/Basic/BuiltinsAMDGPU.def +++ clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -258,5 +258,13 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16, "V16fV2sV2sV16fIiIiIi", "nc", "mai-insts") TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8bf16, "V4fV2sV2sV4fIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4bf16_1k, "V32fV4sV4sV32fIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x4bf16_1k, "V16fV4sV4sV16fIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_4x4x4bf16_1k, "V4fV4sV4sV4fIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x8bf16_1k, "V16fV4sV4sV16fIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x16bf16_1k, "V4fV4sV4sV4fIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_16x16x4f64, "V4dddV4dIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_4x4x4f64, "ddddIiIiIi", "nc", "mai-insts") + #undef BUILTIN #undef TARGET_BUILTIN Index: clang/include/clang/Basic/Cuda.h =================================================================== --- clang/include/clang/Basic/Cuda.h +++ clang/include/clang/Basic/Cuda.h @@ -78,6 +78,7 @@ GFX906, GFX908, GFX909, + GFX90a, GFX90c, GFX1010, GFX1011, Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ 
clang/include/clang/Driver/Options.td @@ -3100,6 +3100,11 @@ HelpText<"Specify CU (-mcumode) or WGP (-mno-cumode) wavefront execution mode (AMDGPU only)">; def mno_cumode : Flag<["-"], "mno-cumode">, Group; +def mtgsplit : Flag<["-"], "mtgsplit">, Group, + HelpText<"Enable threadgroup split execution mode (AMDGPU only)">; +def mno_tgsplit : Flag<["-"], "mno-tgsplit">, Group, + HelpText<"Disable threadgroup split execution mode (AMDGPU only)">; + def mwavefrontsize64 : Flag<["-"], "mwavefrontsize64">, Group, HelpText<"Specify wavefront size 64 mode (AMDGPU only)">; def mno_wavefrontsize64 : Flag<["-"], "mno-wavefrontsize64">, Group, Index: clang/lib/Basic/Cuda.cpp =================================================================== --- clang/lib/Basic/Cuda.cpp +++ clang/lib/Basic/Cuda.cpp @@ -98,6 +98,7 @@ GFX(906), // gfx906 GFX(908), // gfx908 GFX(909), // gfx909 + GFX(90a), // gfx90a GFX(90c), // gfx90c GFX(1010), // gfx1010 GFX(1011), // gfx1011 Index: clang/lib/Basic/Targets/AMDGPU.cpp =================================================================== --- clang/lib/Basic/Targets/AMDGPU.cpp +++ clang/lib/Basic/Targets/AMDGPU.cpp @@ -212,6 +212,9 @@ Features["s-memrealtime"] = true; Features["s-memtime-inst"] = true; break; + case GK_GFX90A: + Features["gfx90a-insts"] = true; + LLVM_FALLTHROUGH; case GK_GFX908: Features["dot3-insts"] = true; Features["dot4-insts"] = true; Index: clang/lib/Basic/Targets/NVPTX.cpp =================================================================== --- clang/lib/Basic/Targets/NVPTX.cpp +++ clang/lib/Basic/Targets/NVPTX.cpp @@ -202,6 +202,7 @@ case CudaArch::GFX906: case CudaArch::GFX908: case CudaArch::GFX909: + case CudaArch::GFX90a: case CudaArch::GFX90c: case CudaArch::GFX1010: case CudaArch::GFX1011: Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -4634,6 +4634,7 @@ case 
CudaArch::GFX906: case CudaArch::GFX908: case CudaArch::GFX909: + case CudaArch::GFX90a: case CudaArch::GFX90c: case CudaArch::GFX1010: case CudaArch::GFX1011: @@ -4703,6 +4704,7 @@ case CudaArch::GFX906: case CudaArch::GFX908: case CudaArch::GFX909: + case CudaArch::GFX90a: case CudaArch::GFX90c: case CudaArch::GFX1010: case CudaArch::GFX1011: Index: clang/test/CodeGenOpenCL/amdgpu-features.cl =================================================================== --- clang/test/CodeGenOpenCL/amdgpu-features.cl +++ clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -23,6 +23,7 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx906 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX906 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX908 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90c -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s @@ -52,6 +53,7 @@ // GFX906: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" // GFX908: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" +// GFX90A: 
"target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst" // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+flat-address-space,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst" Index: clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl =================================================================== --- clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -1,5 +1,6 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -S -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A #pragma OPENCL EXTENSION cl_khr_fp64:enable @@ -19,143 +20,199 @@ typedef double v4d __attribute__((ext_vector_type(4))); -// CHECK-LABEL: @test_mfma_f32_32x32x1f32 -// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0) +#ifdef MFMA_GFX908_TESTS + +// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x1f32 +// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x1f32(global v32f* out, float a, float b, v32f c) { 
*out = __builtin_amdgcn_mfma_f32_32x32x1f32(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_16x16x1f32 -// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x1f32 +// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x1f32(global v16f* out, float a, float b, v16f c) { *out = __builtin_amdgcn_mfma_f32_16x16x1f32(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_4x4x1f32 -// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x1f32 +// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_4x4x1f32(global v4f* out, float a, float b, v4f c) { *out = __builtin_amdgcn_mfma_f32_4x4x1f32(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_32x32x2f32 -// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2f32 +// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float %a, float %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x2f32(global v16f* out, float a, float b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x2f32(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_16x16x4f32 -// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f32 +// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x4f32(global v4f* out, float a, float b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x4f32(a, b, c, 0, 0, 0); } -// CHECK-LABEL: 
@test_mfma_f32_32x32x4f16 -// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4f16 +// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x4f16(global v32f* out, v4h a, v4h b, v32f c) { *out = __builtin_amdgcn_mfma_f32_32x32x4f16(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_16x16x4f16 -// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x4f16 +// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x4f16(global v16f* out, v4h a, v4h b, v16f c) { *out = __builtin_amdgcn_mfma_f32_16x16x4f16(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_4x4x4f16 -// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x4f16 +// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_4x4x4f16(global v4f* out, v4h a, v4h b, v4f c) { *out = __builtin_amdgcn_mfma_f32_4x4x4f16(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_32x32x8f16 -// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x8f16 +// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %a, <4 x half> %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x8f16(global v16f* out, v4h a, v4h b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x8f16(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_16x16x16f16 
-// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x16f16 +// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %a, <4 x half> %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x16f16(global v4f* out, v4h a, v4h b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x16f16(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_i32_32x32x4i8 -// CHECK: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_i32_32x32x4i8 +// CHECK-GFX908: call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_32x32x4i8(global v32i* out, int a, int b, v32i c) { *out = __builtin_amdgcn_mfma_i32_32x32x4i8(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_i32_16x16x4i8 -// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x4i8 +// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_16x16x4i8(global v16i* out, int a, int b, v16i c) { *out = __builtin_amdgcn_mfma_i32_16x16x4i8(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_i32_4x4x4i8 -// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_i32_4x4x4i8 +// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_4x4x4i8(global v4i* out, int a, int b, v4i c) { *out = __builtin_amdgcn_mfma_i32_4x4x4i8(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_i32_32x32x8i8 -// CHECK: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: 
@test_mfma_i32_32x32x8i8 +// CHECK-GFX908: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 %a, i32 %b, <16 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_32x32x8i8(global v16i* out, int a, int b, v16i c) { *out = __builtin_amdgcn_mfma_i32_32x32x8i8(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_i32_16x16x16i8 -// CHECK: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_i32_16x16x16i8 +// CHECK-GFX908: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 %a, i32 %b, <4 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c) { *out = __builtin_amdgcn_mfma_i32_16x16x16i8(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_32x32x2bf16 -// CHECK: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16 +// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c) { *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_16x16x2bf16 -// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16 +// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c) { *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_4x4x2bf16 -// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16 +// CHECK-GFX908: call <4 x float> 
@llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c) { *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_32x32x4bf16 -// CHECK: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16 +// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0); } -// CHECK-LABEL: @test_mfma_f32_16x16x8bf16 -// CHECK: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16 +// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0); } +#endif // MFMA_GFX908_TESTS + +#ifdef MFMA_GFX90A_TESTS + +// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k +// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0) +void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c) +{ + *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0); +} + +// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k +// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0) +void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c) +{ + *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0); +} + +// CHECK-GFX90A-LABEL: 
@test_mfma_f32_4x4x4bf16_1k +// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0) +void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0); +} + +// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k +// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0) +void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c) +{ + *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0); +} + +// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k +// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0) +void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0); +} + +// CHECK-GFX90A-LABEL: @test_mfma_f64_16x16x4f64 +// CHECK-GFX90A: call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %c, i32 0, i32 0, i32 0) +void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c) +{ + *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, 0); +} + +// CHECK-GFX90A-LABEL: @test_mfma_f64_4x4x4f64 +// CHECK-GFX90A: call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %c, i32 0, i32 0, i32 0) +void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c) +{ + *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, 0); +} + +#endif // MFMA_GFX90A_TESTS Index: clang/test/Driver/amdgpu-features.c =================================================================== --- clang/test/Driver/amdgpu-features.c +++ clang/test/Driver/amdgpu-features.c @@ -22,6 +22,11 @@ // RUN: %clang -### -target amdgcn-amdhsa -mcpu=gfx908:sramecc- %s 2>&1 | FileCheck --check-prefix=NO-SRAM-ECC %s // 
NO-SRAM-ECC: "-target-feature" "-sramecc" +// RUN: %clang -### -target amdgcn -mcpu=gfx90A -mtgsplit %s 2>&1 | FileCheck --check-prefix=TGSPLIT %s +// RUN: %clang -### -target amdgcn -mcpu=gfx90A -mno-tgsplit %s 2>&1 | FileCheck --check-prefix=NO-TGSPLIT %s +// TGSPLIT: "-target-feature" "+tgsplit" +// NO-TGSPLIT: "-target-feature" "-tgsplit" + // RUN: %clang -### -target amdgcn-amdpal -mcpu=gfx1010 -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s // RUN: %clang -### -target amdgcn-amdpal -mcpu=gfx1010 -mno-wavefrontsize64 -mwavefrontsize64 %s 2>&1 | FileCheck --check-prefix=WAVE64 %s // WAVE64: "-target-feature" "+wavefrontsize64" Index: clang/test/Driver/amdgpu-macros.cl =================================================================== --- clang/test/Driver/amdgpu-macros.cl +++ clang/test/Driver/amdgpu-macros.cl @@ -105,6 +105,7 @@ // RUN: %clang -E -dM -target amdgcn -mcpu=gfx906 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx906 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx908 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx909 +// RUN: %clang -E -dM -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90a // RUN: %clang -E -dM -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90c // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1011 Index: clang/test/Driver/amdgpu-mcpu.cl =================================================================== --- 
clang/test/Driver/amdgpu-mcpu.cl +++ clang/test/Driver/amdgpu-mcpu.cl @@ -90,6 +90,7 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx906 %s 2>&1 | FileCheck --check-prefix=GFX906 %s // RUN: %clang -### -target amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefix=GFX908 %s // RUN: %clang -### -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefix=GFX909 %s +// RUN: %clang -### -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=GFX90A %s // RUN: %clang -### -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefix=GFX90C %s // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefix=GFX1010 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefix=GFX1011 %s @@ -120,6 +121,7 @@ // GFX906: "-target-cpu" "gfx906" // GFX908: "-target-cpu" "gfx908" // GFX909: "-target-cpu" "gfx909" +// GFX90A: "-target-cpu" "gfx90a" // GFX90C: "-target-cpu" "gfx90c" // GFX1010: "-target-cpu" "gfx1010" // GFX1011: "-target-cpu" "gfx1011" Index: clang/test/Driver/cuda-bad-arch.cu =================================================================== --- clang/test/Driver/cuda-bad-arch.cu +++ clang/test/Driver/cuda-bad-arch.cu @@ -25,6 +25,8 @@ // RUN: | FileCheck -check-prefix OK %s // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=gfx908 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix OK %s +// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=gfx90a -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix OK %s // RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \ // RUN: | FileCheck -check-prefix OK %s Index: clang/test/Driver/hip-toolchain-features.hip =================================================================== --- clang/test/Driver/hip-toolchain-features.hip +++ clang/test/Driver/hip-toolchain-features.hip @@ -48,6 +48,18 @@ // ALL3: {{.*}}clang{{.*}}"-target-feature" "+sramecc" "-target-feature" "+xnack" // NOALL3: {{.*}}clang{{.*}}"-target-feature" "-sramecc" "-target-feature" "-xnack" 
+// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -nogpulib \ +// RUN: --cuda-gpu-arch=gfx1010 %s \ +// RUN: -mtgsplit 2>&1 | FileCheck %s -check-prefix=TGSPLIT +// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -nogpulib \ +// RUN: --cuda-gpu-arch=gfx1010 %s \ +// RUN: -mno-tgsplit 2>&1 | FileCheck %s -check-prefix=NOTTGSPLIT + +// TGSPLIT: {{.*}}clang{{.*}}"-target-feature" "+tgsplit" +// NOTTGSPLIT: {{.*}}clang{{.*}}"-target-feature" "-tgsplit" +// TGSPLIT: {{.*}}lld{{.*}} "-plugin-opt=-mattr=+tgsplit" +// NOTTGSPLIT: {{.*}}lld{{.*}} "-plugin-opt=-mattr=-tgsplit" + // RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -nogpulib \ // RUN: --cuda-gpu-arch=gfx1010 %s \ // RUN: -mcumode -mcumode -mno-cumode -mwavefrontsize64 -mcumode \ Index: clang/test/Misc/target-invalid-cpu-note.c =================================================================== --- clang/test/Misc/target-invalid-cpu-note.c +++ clang/test/Misc/target-invalid-cpu-note.c @@ -86,7 +86,7 @@ // AMDGCN-SAME: gfx703, kabini, mullins, gfx704, bonaire, gfx705, gfx801, carrizo, // AMDGCN-SAME: gfx802, iceland, tonga, gfx803, fiji, polaris10, polaris11, // AMDGCN-SAME: gfx805, tongapro, gfx810, stoney, gfx900, gfx902, gfx904, gfx906, -// AMDGCN-SAME: gfx908, gfx909, gfx90c, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, +// AMDGCN-SAME: gfx908, gfx909, gfx90a, gfx90c, gfx1010, gfx1011, gfx1012, gfx1030, gfx1031, // AMDGCN-SAME: gfx1032, gfx1033 // RUN: not %clang_cc1 -triple wasm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix WEBASM Index: clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl =================================================================== --- /dev/null +++ clang/test/SemaOpenCL/builtins-amdgcn-error-gfx90a-param.cl @@ -0,0 +1,67 @@ +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx90a -verify -S -o - %s + +#pragma OPENCL EXTENSION cl_khr_fp64:enable + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef float v16f 
__attribute__((ext_vector_type(16))); +typedef float v32f __attribute__((ext_vector_type(32))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef half v32h __attribute__((ext_vector_type(32))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef int v16i __attribute__((ext_vector_type(16))); +typedef int v32i __attribute__((ext_vector_type(32))); +typedef short v2s __attribute__((ext_vector_type(2))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef short v16s __attribute__((ext_vector_type(16))); +typedef short v32s __attribute__((ext_vector_type(32))); +typedef double v4d __attribute__((ext_vector_type(4))); + +void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c, int d) +{ + *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4bf16_1k' must be a constant integer}} +} + +void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c, int d) +{ + *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x4bf16_1k' must be a constant integer}} +} + +void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f 
c, int d) +{ + *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x4bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x4bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_4x4x4bf16_1k' must be a constant integer}} +} + +void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c, int d) +{ + *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x8bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x8bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x8bf16_1k' must be a constant integer}} +} + +void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c, int d) +{ + *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x16bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x16bf16_1k' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x16bf16_1k' must be a constant integer}} +} + +void test_mfma_f64_16x16x4f64(global v4d* out, double a, double b, v4d c, int d) +{ + *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_16x16x4f64' must be a constant integer}} + *out = 
__builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_16x16x4f64' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f64_16x16x4f64(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_16x16x4f64' must be a constant integer}} +} + +void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c, int d) +{ + *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_4x4x4f64' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_4x4x4f64' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f64_4x4x4f64(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f64_4x4x4f64' must be a constant integer}} +} Index: llvm/docs/AMDGPUUsage.rst =================================================================== --- llvm/docs/AMDGPUUsage.rst +++ llvm/docs/AMDGPUUsage.rst @@ -354,6 +354,13 @@ Add product names. + ``gfx90a`` ``amdgcn`` dGPU - sramecc - Absolute - *rocm-amdhsa* *TBA* + - tgsplit flat + - xnack scratch .. TODO:: + - Packed + work-item Add product + IDs names. + ``gfx90c`` ``amdgcn`` APU - xnack - Absolute - *pal-amdpal* - Ryzen 7 4700G flat - Ryzen 7 4700GE scratch - Ryzen 5 4600G @@ -481,6 +488,11 @@ loaded and executed in a process with either setting of SRAMECC. + tgsplit ``-m[no-]tgsplit`` Enable/disable generating code that assumes + work-groups are launched in threadgroup split mode. + When enabled the waves of a work-group may be + launched in different CUs. + wavefrontsize64 - ``-m[no-]wavefrontsize64`` Control the wavefront size used when generating code for kernels. 
When disabled native wavefront size 32 is used, when enabled @@ -1119,6 +1131,7 @@ ``EF_AMDGPU_MACH_AMDGCN_GFX906`` 0x02f ``gfx906`` ``EF_AMDGPU_MACH_AMDGCN_GFX908`` 0x030 ``gfx908`` ``EF_AMDGPU_MACH_AMDGCN_GFX909`` 0x031 ``gfx909`` + ``EF_AMDGPU_MACH_AMDGCN_GFX90A`` 0x03f ``gfx90a`` ``EF_AMDGPU_MACH_AMDGCN_GFX90C`` 0x032 ``gfx90c`` ``EF_AMDGPU_MACH_AMDGCN_GFX1010`` 0x033 ``gfx1010`` ``EF_AMDGPU_MACH_AMDGCN_GFX1011`` 0x034 ``gfx1011`` @@ -3594,6 +3607,14 @@ bytes 383:352 4 bytes COMPUTE_PGM_RSRC3 GFX6-GFX9 Reserved, must be 0. + GFX90A + Compute Shader (CS) + program settings used by + CP to set up + ``COMPUTE_PGM_RSRC3`` + configuration + register. See + :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. GFX10 Compute Shader (CS) program settings used by @@ -3672,6 +3693,11 @@ GFX6-GFX9 - vgprs_used 0..256 - max(0, ceil(vgprs_used / 4) - 1) + GFX90A + - vgprs_used 0..512 + - vgprs_used = align(arch_vgprs, 4) + + acc_vgprs + - max(0, ceil(vgprs_used / 8) - 1) GFX10 (wavefront size 64) - max_vgpr 1..256 - max(0, ceil(vgprs_used / 4) - 1) @@ -4099,6 +4125,29 @@ 32 **Total size 4 bytes.** ======= =================================================================================================================== +.. + + .. table:: compute_pgm_rsrc3 for GFX90A + :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table + + ======= ======= =============================== =========================================================================== + Bits Size Field Name Description + ======= ======= =============================== =========================================================================== + 5:0 6 bits ACCUM_OFFSET Offset of the first AccVGPR in the unified register file. Granularity 4. + Value 0-63. 0 - accum-offset = 4, 1 - accum-offset = 8, ..., + 63 - accum-offset = 256. + 15:6 10 Reserved, must be 0. + bits + 16 1 bit TG_SPLIT - If 0 the waves of a work-group are + launched in the same CU. + - If 1 the waves of a work-group can be + launched in different CUs. 
The waves + cannot use S_BARRIER or LDS. + 17:31 15 Reserved, must be 0. + bits + 32 **Total size 4 bytes.** + ======= =================================================================================================================== + .. .. table:: compute_pgm_rsrc3 for GFX10 @@ -4295,11 +4344,19 @@ VGPR0, the next enabled register is VGPR1 etc.; disabled registers do not have a VGPR number. -VGPR register initial state is defined in -:ref:`amdgpu-amdhsa-vgpr-register-set-up-order-table`. +There are different methods used for the VGPR initial state: - .. table:: VGPR Register Set Up Order - :name: amdgpu-amdhsa-vgpr-register-set-up-order-table +* Unless the *Target Properties* column of :ref:`amdgpu-processor-table` + specifies otherwise, a separate VGPR register is used per work-item ID. The + VGPR register initial state for this method is defined in + :ref:`amdgpu-amdhsa-vgpr-register-set-up-order-for-unpacked-work-item-id-method-table`. +* If *Target Properties* column of :ref:`amdgpu-processor-table` + specifies *Packed work-item IDs*, the initial value of VGPR0 register is used + for all work-item IDs. The register layout for this method is defined in + :ref:`amdgpu-amdhsa-register-layout-for-packed-work-item-id-method-table`. + + .. table:: VGPR Register Set Up Order for Unpacked Work-Item ID Method + :name: amdgpu-amdhsa-vgpr-register-set-up-order-for-unpacked-work-item-id-method-table ========== ========================== ====== ============================== VGPR Order Name Number Description @@ -4317,6 +4374,35 @@ > 1) wavefront lane. ========== ========================== ====== ============================== +.. + + .. 
table:: Register Layout for Packed Work-Item ID Method + :name: amdgpu-amdhsa-register-layout-for-packed-work-item-id-method-table + + ======= ======= ================ ========================================= + Bits Size Field Name Description + ======= ======= ================ ========================================= + 0:9 10 bits Work-Item Id X Work-item id in X + dimension of work-group for + wavefront lane. + + Always initialized. + + 10:19 10 bits Work-Item Id Y Work-item id in Y + dimension of work-group for + wavefront lane. + + Initialized if enable_vgpr_workitem_id > + 0, otherwise set to 0. + 20:29 10 bits Work-Item Id Z Work-item id in Z + dimension of work-group for + wavefront lane. + + Initialized if enable_vgpr_workitem_id > + 1, otherwise set to 0. + 30:31 2 bits Reserved, set to 0. + ======= ======= ================ ========================================= + The setting of registers is done by GPU CP/ADC/SPI hardware as follows: 1. SGPRs before the Work-Group Ids are set by CP using the 16 User Data @@ -4651,6 +4737,7 @@ following sections: * :ref:`amdgpu-amdhsa-memory-model-gfx6-gfx9` +* :ref:`amdgpu-amdhsa-memory-model-gfx90a` * :ref:`amdgpu-amdhsa-memory-model-gfx10` .. _amdgpu-amdhsa-memory-model-gfx6-gfx9: @@ -5914,81 +6001,93 @@ - system for OpenCL.* ============ ============ ============== ========== ================================ -.. _amdgpu-amdhsa-memory-model-gfx10: +.. _amdgpu-amdhsa-memory-model-gfx90a: -Memory Model GFX10 -++++++++++++++++++ +Memory Model GFX90A ++++++++++++++++++++ -For GFX10: +For GFX90A: * Each agent has multiple shader arrays (SA). -* Each SA has multiple work-group processors (WGP). -* Each WGP has multiple compute units (CU). +* Each SA has multiple compute units (CU). * Each CU has multiple SIMDs that execute wavefronts. -* The wavefronts for a single work-group are executed in the same - WGP. In CU wavefront execution mode the wavefronts may be executed by - different SIMDs in the same CU. 
In WGP wavefront execution mode the - wavefronts may be executed by different SIMDs in different CUs in the same - WGP. -* Each WGP has a single LDS memory shared by the wavefronts of the work-groups - executing on it. -* All LDS operations of a WGP are performed as wavefront wide operations in a +* The wavefronts for a single work-group are executed in the same CU but may be + executed by different SIMDs. The exception is when in tgsplit execution mode + when the wavefronts may be executed by different SIMDs in different CUs. +* Each CU has a single LDS memory shared by the wavefronts of the work-groups + executing on it. The exception is when in tgsplit execution mode when no LDS + is allocated as wavefronts of the same work-group can be in different CUs. +* All LDS operations of a CU are performed as wavefront wide operations in a global order and involve no caching. Completion is reported to a wavefront in execution order. * The LDS memory has multiple request queues shared by the SIMDs of a - WGP. Therefore, the LDS operations performed by different wavefronts of a + CU. Therefore, the LDS operations performed by different wavefronts of a work-group can be reordered relative to each other, which can result in reordering the visibility of vector memory operations with respect to LDS operations of other wavefronts in the same work-group. A ``s_waitcnt lgkmcnt(0)`` is required to ensure synchronization between LDS operations and vector memory operations between wavefronts of a work-group, but not between operations performed by the same wavefront. -* The vector memory operations are performed as wavefront wide operations. - Completion of load/store/sample operations are reported to a wavefront in - execution order of other load/store/sample operations performed by that - wavefront. -* The vector memory operations access a vector L0 cache. There is a single L0 - cache per CU. Each SIMD of a CU accesses the same L0 cache. 
Therefore, no - special action is required for coherence between the lanes of a single - wavefront. However, a ``buffer_gl0_inv`` is required for coherence between - wavefronts executing in the same work-group as they may be executing on SIMDs - of different CUs that access different L0s. A ``buffer_gl0_inv`` is also - required for coherence between wavefronts executing in different work-groups - as they may be executing on different WGPs. -* The scalar memory operations access a scalar L0 cache shared by all wavefronts - on a WGP. The scalar and vector L0 caches are not coherent. However, scalar - operations are used in a restricted way so do not impact the memory model. See - :ref:`amdgpu-amdhsa-memory-spaces`. -* The vector and scalar memory L0 caches use an L1 cache shared by all WGPs on - the same SA. Therefore, no special action is required for coherence between - the wavefronts of a single work-group. However, a ``buffer_gl1_inv`` is - required for coherence between wavefronts executing in different work-groups - as they may be executing on different SAs that access different L1s. -* The L1 caches have independent quadrants to service disjoint ranges of virtual - addresses. -* Each L0 cache has a separate request queue per L1 quadrant. Therefore, the - vector and scalar memory operations performed by different wavefronts, whether - executing in the same or different work-groups (which may be executing on - different CUs accessing different L0s), can be reordered relative to each - other. A ``s_waitcnt vmcnt(0) & vscnt(0)`` is required to ensure - synchronization between vector memory operations of different wavefronts. It - ensures a previous vector memory operation has completed before executing a - subsequent vector memory or LDS operation and so can be used to meet the - requirements of acquire, release and sequential consistency. -* The L1 caches use an L2 cache shared by all SAs on the same agent. 
-* The L2 cache has independent channels to service disjoint ranges of virtual - addresses. -* Each L1 quadrant of a single SA accesses a different L2 channel. Each L1 - quadrant has a separate request queue per L2 channel. Therefore, the vector - and scalar memory operations performed by wavefronts executing in different - work-groups (which may be executing on different SAs) of an agent can be - reordered relative to each other. A ``s_waitcnt vmcnt(0) & vscnt(0)`` is - required to ensure synchronization between vector memory operations of - different SAs. It ensures a previous vector memory operation has completed - before executing a subsequent vector memory and so can be used to meet the - requirements of acquire, release and sequential consistency. -* The L2 cache can be kept coherent with other agents on some targets, or ranges - of virtual addresses can be set up to bypass it to ensure system coherence. +* The vector memory operations are performed as wavefront wide operations and + completion is reported to a wavefront in execution order. The exception is + that ``flat_load/store/atomic`` instructions can report out of vector memory + order if they access LDS memory, and out of LDS operation order if they access + global memory. +* The vector memory operations access a single vector L1 cache shared by all + SIMDs of a CU. Therefore: + + * No special action is required for coherence between the lanes of a single + wavefront. + + * No special action is required for coherence between wavefronts in the same + work-group since they execute on the same CU. The exception is when in + tgsplit execution mode as wavefronts of the same work-group can be in + different CUs and so a ``buffer_wbinvl1_vol`` is required as described in + the following item. + + * A ``buffer_wbinvl1_vol`` is required for coherence between wavefronts + executing in different work-groups as they may be executing on different + CUs. 
+ +* The scalar memory operations access a scalar L1 cache shared by all wavefronts + on a group of CUs. The scalar and vector L1 caches are not coherent. However, + scalar operations are used in a restricted way so do not impact the memory + model. See :ref:`amdgpu-amdhsa-memory-spaces`. +* The vector and scalar memory operations use an L2 cache shared by all CUs on + the same agent. + + * The L2 cache has independent channels to service disjoint ranges of virtual + addresses. + * Each CU has a separate request queue per channel. Therefore, the vector and + scalar memory operations performed by wavefronts executing in different + work-groups (which may be executing on different CUs), or the same + work-group if executing in tgsplit mode, of an agent can be reordered + relative to each other. A ``s_waitcnt vmcnt(0)`` is required to ensure + synchronization between vector memory operations of different CUs. It + ensures a previous vector memory operation has completed before executing a + subsequent vector memory or LDS operation and so can be used to meet the + requirements of acquire and release. + * The L2 cache of one agent can be kept coherent with other agents by using + the MTYPE RW (read-write) for memory local to the L2, and MTYPE NC + (non-coherent) with the PTE C-bit set for memory not local to the L2. + + * Any local memory cache lines will be automatically invalidated by writes + from CUs associated with other L2 caches, or writes from the CPU, due to + the cache probe caused by the PTE C-bit. + * XGMI accesses from the CPU to local memory may be cached on the CPU. + Subsequent access from the GPU will automatically invalidate or writeback + the CPU cache due to the L2 probe filter. + * Since all work-groups on the same agent share the same L2, no L2 + invalidation or writeback is required for coherence. + * To ensure coherence of local memory writes of work-groups in different + agents a ``buffer_wbl2`` is required. 
It will writeback dirty L2 cache + lines. + * To ensure coherence of local memory reads of work-groups in different + agents a ``buffer_invl2`` is required. It will invalidate non-local L2 + cache lines. + + * PCIe access from the GPU to the CPU memory can be kept coherent by using the + MTYPE UC (uncached) which bypasses the L2. Scalar memory operations are only used to access memory that is proven to not change during the execution of the kernel dispatch. This includes constant @@ -6010,49 +6109,29 @@ For kernarg backing memory: -* CP invalidates the L0 and L1 caches at the start of each kernel dispatch. -* On dGPU the kernarg backing memory is accessed as MTYPE UC (uncached) to avoid - needing to invalidate the L2 cache. +* CP invalidates the L1 cache at the start of each kernel dispatch. +* On dGPU over XGMI or PCIe the kernarg backing memory is allocated in host + memory accessed as MTYPE UC (uncached) to avoid needing to invalidate the L2 + cache. This also causes it to be treated as non-volatile and so is not + invalidated by ``*_vol``. * On APU the kernarg backing memory is accessed as MTYPE CC (cache coherent) and so the L2 cache will be coherent with the CPU and other agents. Scratch backing memory (which is used for the private address space) is accessed -with MTYPE NC (non-coherent). Since the private address space is only accessed -by a single thread, and is always write-before-read, there is never a need to -invalidate these entries from the L0 or L1 caches. - -Wavefronts are executed in native mode with in-order reporting of loads and -sample instructions. In this mode vmcnt reports completion of load, atomic with -return and sample instructions in order, and the vscnt reports the completion of -store and atomic without return in order. See ``MEM_ORDERED`` field in -:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`. 
- -Wavefronts can be executed in WGP or CU wavefront execution mode: - -* In WGP wavefront execution mode the wavefronts of a work-group are executed - on the SIMDs of both CUs of the WGP. Therefore, explicit management of the per - CU L0 caches is required for work-group synchronization. Also accesses to L1 - at work-group scope need to be explicitly ordered as the accesses from - different CUs are not ordered. -* In CU wavefront execution mode the wavefronts of a work-group are executed on - the SIMDs of a single CU of the WGP. Therefore, all global memory access by - the work-group access the same L0 which in turn ensures L1 accesses are - ordered and so do not require explicit management of the caches for - work-group synchronization. - -See ``WGP_MODE`` field in -:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table` and -:ref:`amdgpu-target-features`. +with MTYPE NC_NV (non-coherent non-volatile). Since the private address space is +only accessed by a single thread, and is always write-before-read, there is +never a need to invalidate these entries from the L1 cache. Hence all cache +invalidates are done as ``*_vol`` to only invalidate the volatile cache lines. -The code sequences used to implement the memory model for GFX10 are defined in -table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`. +The code sequences used to implement the memory model for GFX90A are defined +in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`. - .. table:: AMDHSA Memory Model Code Sequences GFX10 - :name: amdgpu-amdhsa-memory-model-code-sequences-gfx10-table + .. 
table:: AMDHSA Memory Model Code Sequences GFX90A + :name: amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table ============ ============ ============== ========== ================================ LLVM Instr LLVM Memory LLVM Memory AMDGPU AMDGPU Machine Code - Ordering Sync Scope Address GFX10 + Ordering Sync Scope Address GFX90A Space ============ ============ ============== ========== ================================ **Non-Atomic** @@ -6064,12 +6143,12 @@ - !volatile & nontemporal 1. buffer/global/flat_load - slc=1 + glc=1 slc=1 - volatile 1. buffer/global/flat_load - glc=1 dlc=1 + glc=1 scc=1 2. s_waitcnt vmcnt(0) - Must happen before @@ -6091,13 +6170,14 @@ - constant - !volatile & nontemporal - 1. buffer/global/flat_store - slc=1 + 1. buffer/global/flat_store + glc=1 slc=1 - volatile 1. buffer/global/flat_store - 2. s_waitcnt vscnt(0) + scc=1 + 2. s_waitcnt vmcnt(0) - Must happen before any following volatile @@ -6124,30 +6204,40 @@ load atomic monotonic - workgroup - global 1. buffer/global/flat_load - generic glc=1 - - If CU wavefront execution + - If not TgSplit execution mode, omit glc=1. - load atomic monotonic - singlethread - local 1. ds_load - - wavefront - - workgroup + load atomic monotonic - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + - workgroup be used.* + + 1. ds_load load atomic monotonic - agent - global 1. buffer/global/flat_load - - system - generic glc=1 dlc=1 + - generic glc=1 + load atomic monotonic - system - global 1. buffer/global/flat_load + - generic glc=1 scc=1 store atomic monotonic - singlethread - global 1. buffer/global/flat_store - wavefront - generic - workgroup - agent - - system - store atomic monotonic - singlethread - local 1. ds_store - - wavefront - - workgroup + store atomic monotonic - system - global 1. 
buffer/global/flat_store + - generic scc=1 + store atomic monotonic - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + - workgroup be used.* + + 1. ds_store atomicrmw monotonic - singlethread - global 1. buffer/global/flat_atomic - wavefront - generic - workgroup - agent - - system - atomicrmw monotonic - singlethread - local 1. ds_atomic - - wavefront - - workgroup + atomicrmw monotonic - system - global 1. buffer/global/flat_atomic + - generic scc=1 + atomicrmw monotonic - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + - workgroup be used.* + + 1. ds_atomic **Acquire Atomic** ------------------------------------------------------------------------------------ load atomic acquire - singlethread - global 1. buffer/global/ds/flat_load @@ -6155,38 +6245,43 @@ - generic load atomic acquire - workgroup - global 1. buffer/global_load glc=1 - - If CU wavefront execution + - If not TgSplit execution mode, omit glc=1. 2. s_waitcnt vmcnt(0) - - If CU wavefront execution + - If not TgSplit execution + mode, omit. + - Must happen before the + following buffer_wbinvl1_vol. + + 3. buffer_wbinvl1_vol + + - If not TgSplit execution mode, omit. - Must happen before - the following buffer_gl0_inv - and before any following + any following global/generic load/load atomic/store/store atomic/atomicrmw. - - 3. buffer_gl0_inv - - - If CU wavefront execution - mode, omit. - Ensures that following loads will not see stale data. - load atomic acquire - workgroup - local 1. ds_load + load atomic acquire - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1. ds_load 2. s_waitcnt lgkmcnt(0) - If OpenCL, omit. - Must happen before - the following buffer_gl0_inv - and before any following - global/generic load/load + any following + global/generic + load/load atomic/store/store atomic/atomicrmw. - Ensures any @@ -6196,31 +6291,21 @@ atomic value being acquired. - 3. 
buffer_gl0_inv - - - If CU wavefront execution - mode, omit. - - If OpenCL, omit. - - Ensures that - following - loads will not see - stale data. - load atomic acquire - workgroup - generic 1. flat_load glc=1 - - If CU wavefront execution + - If not TgSplit execution mode, omit glc=1. - 2. s_waitcnt lgkmcnt(0) & - vmcnt(0) + 2. s_waitcnt lgkm/vmcnt(0) - - If CU wavefront execution - mode, omit vmcnt(0). - - If OpenCL, omit - lgkmcnt(0). + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit lgkmcnt(0). - Must happen before the following - buffer_gl0_inv and any + buffer_wbinvl1_vol and any following global/generic load/load atomic/store/store @@ -6232,9 +6317,9 @@ atomic value being acquired. - 3. buffer_gl0_inv + 3. buffer_wbinvl1_vol - - If CU wavefront execution + - If not TgSplit execution mode, omit. - Ensures that following @@ -6242,19 +6327,18 @@ stale data. load atomic acquire - agent - global 1. buffer/global_load - - system glc=1 dlc=1 + glc=1 2. s_waitcnt vmcnt(0) - Must happen before following - buffer_gl*_inv. + buffer_wbinvl1_vol. - Ensures the load has completed before invalidating - the caches. + the cache. - 3. buffer_gl0_inv; - buffer_gl1_inv + 3. buffer_wbinvl1_vol - Must happen before any following @@ -6266,22 +6350,51 @@ loads will not see stale global data. - load atomic acquire - agent - generic 1. flat_load glc=1 dlc=1 - - system 2. s_waitcnt vmcnt(0) & + load atomic acquire - system - global 1. buffer/global/flat_load + glc=1 scc=1 + 2. s_waitcnt vmcnt(0) + + - Must happen before + following buffer_invl2 and + buffer_wbinvl1_vol. + - Ensures the load + has completed + before invalidating + the cache. + + 3. buffer_invl2; + buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. 
+ + load atomic acquire - agent - generic 1. flat_load glc=1 + 2. s_waitcnt vmcnt(0) & lgkmcnt(0) + - If TgSplit execution mode, + omit lgkmcnt(0). - If OpenCL omit lgkmcnt(0). - Must happen before following - buffer_gl*_invl. + buffer_wbinvl1_vol. - Ensures the flat_load has completed before invalidating - the caches. + the cache. - 3. buffer_gl0_inv; - buffer_gl1_inv + 3. buffer_wbinvl1_vol - Must happen before any following @@ -6293,8 +6406,2345 @@ will not see stale global data. - atomicrmw acquire - singlethread - global 1. buffer/global/ds/flat_atomic - - wavefront - local + load atomic acquire - system - generic 1. flat_load glc=1 scc=1 + 2. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL omit + lgkmcnt(0). + - Must happen before + following + buffer_invl2 and + buffer_wbinvl1_vol. + - Ensures the flat_load + has completed + before invalidating + the caches. + + 3. buffer_invl2; + buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + atomicrmw acquire - singlethread - global 1. buffer/global/flat_atomic + - wavefront - generic + atomicrmw acquire - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + be used.* + + 1. ds_atomic + atomicrmw acquire - workgroup - global 1. buffer/global_atomic + 2. s_waitcnt vmcnt(0) + + - If not TgSplit execution + mode, omit. + - Must happen before the + following buffer_wbinvl1_vol. + - Ensures the atomicrmw + has completed + before invalidating + the cache. + + 3. buffer_wbinvl1_vol + + - If not TgSplit execution + mode, omit. + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. 
+ + atomicrmw acquire - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1. ds_atomic + 2. s_waitcnt lgkmcnt(0) + + - If OpenCL, omit. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than the local + atomicrmw value + being acquired. + + atomicrmw acquire - workgroup - generic 1. flat_atomic + 2. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit lgkmcnt(0). + - Must happen before + the following + buffer_wbinvl1_vol and + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than a local + atomicrmw value + being acquired. + + 3. buffer_wbinvl1_vol + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + atomicrmw acquire - agent - global 1. buffer/global_atomic + 2. s_waitcnt vmcnt(0) + + - Must happen before + following + buffer_wbinvl1_vol. + - Ensures the + atomicrmw has + completed before + invalidating the + cache. + + 3. buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acquire - system - global 1. buffer/global_atomic + scc=1 + 2. s_waitcnt vmcnt(0) + + - Must happen before + following buffer_invl2 and + buffer_wbinvl1_vol. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 3. buffer_invl2; + buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. 
+ + atomicrmw acquire - agent - generic 1. flat_atomic + 2. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + following + buffer_wbinvl1_vol. + - Ensures the + atomicrmw has + completed before + invalidating the + cache. + + 3. buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acquire - system - generic 1. flat_atomic scc=1 + 2. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + following + buffer_invl2 and + buffer_wbinvl1_vol. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 3. buffer_invl2; + buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + fence acquire - singlethread *none* *none* + - wavefront + fence acquire - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate. If + fence had an + address space then + set to address + space of OpenCL + fence flag, or to + generic if both + local and global + flags are + specified. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load + atomic/ + atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). 
+ - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Must happen before + the following + buffer_wbinvl1_vol and + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than the + value read by the + fence-paired-atomic. + + 2. buffer_wbinvl1_vol + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + fence acquire - agent *none* 1. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Must happen before + the following + buffer_wbinvl1_vol. + - Ensures that the + fence-paired atomic + has completed + before invalidating + the + cache. Therefore + any following + locations read must + be no older than + the value read by + the + fence-paired-atomic. + + 2. 
buffer_wbinvl1_vol + + - Must happen before any + following global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + fence acquire - system *none* 1. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Must happen before + the following buffer_invl2 and + buffer_wbinvl1_vol. + - Ensures that the + fence-paired atomic + has completed + before invalidating + the + cache. Therefore + any following + locations read must + be no older than + the value read by + the + fence-paired-atomic. + + 2. buffer_invl2; + buffer_wbinvl1_vol + + - Must happen before any + following global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + **Release Atomic** + ------------------------------------------------------------------------------------ + store atomic release - singlethread - global 1. 
buffer/global/flat_store + - wavefront - generic + store atomic release - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + be used.* + + 1. ds_store + store atomic release - workgroup - global 1. s_waitcnt lgkm/vmcnt(0) + - generic + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit lgkmcnt(0). + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + store. + - Ensures that all + memory operations + have + completed before + performing the + store that is being + released. + + 2. buffer/global/flat_store + store atomic release - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1. ds_store + store atomic release - agent - global 1. s_waitcnt lgkmcnt(0) & + - generic vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + store. + - Ensures that all + memory operations + to memory have + completed before + performing the + store that is being + released. + + 2. buffer/global/flat_store + store atomic release - system - global 1. buffer_wbl2 + - generic + - Must happen before + following s_waitcnt. 
+ - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after any + preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after any + preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + store. + - Ensures that all + memory operations + to memory and the L2 + writeback have + completed before + performing the + store that is being + released. + + 3. buffer/global/flat_store + scc=1 + atomicrmw release - singlethread - global 1. buffer/global/flat_atomic + - wavefront - generic + atomicrmw release - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + be used.* + + 1. ds_atomic + atomicrmw release - workgroup - global 1. s_waitcnt lgkm/vmcnt(0) + - generic + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit + lgkmcnt(0). + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 2. buffer/global/flat_atomic + atomicrmw release - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1.
ds_atomic + atomicrmw release - agent - global 1. s_waitcnt lgkmcnt(0) & + - generic vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global and local + have completed + before performing + the atomicrmw that + is being released. + + 2. buffer/global/flat_atomic + atomicrmw release - system - global 1. buffer_wbl2 + - generic + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to memory and the L2 + writeback have + completed before + performing the + atomicrmw that is + being released. + + 3. buffer/global/flat_atomic + scc=1 + fence release - singlethread *none* *none* + - wavefront + fence release - workgroup *none* 1.
s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate. If + fence had an + address space then + set to address + space of OpenCL + fence flag, or to + generic if both + local and global + flags are + specified. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Must happen before + any following store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Ensures that all + memory operations + have + completed before + performing the + following + fence-paired-atomic. + + fence release - agent *none* 1. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate. If + fence had an + address space then + set to address + space of OpenCL + fence flag, or to + generic if both + local and global + flags are + specified. + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. 
+ - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + any following store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Ensures that all + memory operations + have + completed before + performing the + following + fence-paired-atomic. + + fence release - system *none* 1. buffer_wbl2 + + - If OpenCL and + address space is + local, omit. + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate. If + fence had an + address space then + set to address + space of OpenCL + fence flag, or to + generic if both + local and global + flags are + specified. + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + any following store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Ensures that all + memory operations + have + completed before + performing the + following + fence-paired-atomic. 
+ + **Acquire-Release Atomic** + ------------------------------------------------------------------------------------ + atomicrmw acq_rel - singlethread - global 1. buffer/global/flat_atomic + - wavefront - generic + atomicrmw acq_rel - singlethread - local *If TgSplit execution mode, + - wavefront local address space cannot + be used.* + + 1. ds_atomic + atomicrmw acq_rel - workgroup - global 1. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit + lgkmcnt(0). + - Must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 2. buffer/global_atomic + 3. s_waitcnt vmcnt(0) + + - If not TgSplit execution + mode, omit. + - Must happen before + the following + buffer_wbinvl1_vol. + - Ensures any + following global + data read is no + older than the + atomicrmw value + being acquired. + + 4. buffer_wbinvl1_vol + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + atomicrmw acq_rel - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + 1. ds_atomic + 2. s_waitcnt lgkmcnt(0) + + - If OpenCL, omit. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than the local load + atomic value being + acquired. + + atomicrmw acq_rel - workgroup - generic 1. 
s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL, omit + lgkmcnt(0). + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 2. flat_atomic + 3. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If not TgSplit execution + mode, omit vmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + the following + buffer_wbinvl1_vol and + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than a local load + atomic value being + acquired. + + 4. buffer_wbinvl1_vol + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + atomicrmw acq_rel - agent - global 1. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global have + completed before + performing the + atomicrmw that is + being released. + + 2. buffer/global_atomic + 3.
s_waitcnt vmcnt(0) + + - Must happen before + following + buffer_wbinvl1_vol. + - Ensures the + atomicrmw has + completed before + invalidating the + cache. + + 4. buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acq_rel - system - global 1. buffer_wbl2 + + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global and L2 writeback + have completed before + performing the + atomicrmw that is + being released. + + 3. buffer/global_atomic + scc=1 + 4. s_waitcnt vmcnt(0) + + - Must happen before + following buffer_invl2 and + buffer_wbinvl1_vol. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 5. buffer_invl2; + buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + atomicrmw acq_rel - agent - generic 1. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). 
+ - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global have + completed before + performing the + atomicrmw that is + being released. + + 2. flat_atomic + 3. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + following + buffer_wbinvl1_vol. + - Ensures the + atomicrmw has + completed before + invalidating the + cache. + + 4. buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acq_rel - system - generic 1. buffer_wbl2 + + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. 
+ - Ensures that all + memory operations + to global and L2 writeback + have completed before + performing the + atomicrmw that is + being released. + + 3. flat_atomic scc=1 + 4. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + following buffer_invl2 and + buffer_wbinvl1_vol. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 5. buffer_invl2; + buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + fence acq_rel - singlethread *none* *none* + - wavefront + fence acq_rel - workgroup *none* 1. s_waitcnt lgkm/vmcnt(0) + + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - If OpenCL and + address space is + local, omit + vmcnt(0). + - However, + since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/ + load atomic/store atomic/ + atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that all + memory operations + have + completed before + performing any + following global + memory operations. 
+ - Ensures that the + preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + acquire-fence-paired-atomic) + has completed + before following + global memory + operations. This + satisfies the + requirements of + acquire. + - Ensures that all + previous memory + operations have + completed before a + following + local/generic store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + release-fence-paired-atomic). + This satisfies the + requirements of + release. + - Must happen before + the following + buffer_wbinvl1_vol. + - Ensures that the + acquire-fence-paired + atomic has completed + before invalidating + the + cache. Therefore + any following + locations read must + be no older than + the value read by + the + acquire-fence-paired-atomic. + + 2. buffer_wbinvl1_vol + + - If not TgSplit execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + fence acq_rel - agent *none* 1. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + buffer_wbinvl1_vol.
+ - Ensures that the + preceding + global/local/generic + load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + acquire-fence-paired-atomic) + has completed + before invalidating + the cache. This + satisfies the + requirements of + acquire. + - Ensures that all + previous memory + operations have + completed before a + following + global/local/generic + store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + release-fence-paired-atomic). + This satisfies the + requirements of + release. + + 2. buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. This + satisfies the + requirements of + acquire. + + fence acq_rel - system *none* 1. buffer_wbl2 + + - If OpenCL and + address space is + local, omit. + - Must happen before + following s_waitcnt. + - Performs L2 writeback to + ensure previous + global/generic + store/atomicrmw are + visible at system scope. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - If OpenCL and + address space is + not generic, omit + lgkmcnt(0). + - However, since LLVM + currently has no + address space on + the fence need to + conservatively + always generate + (see comment for + previous fence). + - Could be split into + separate s_waitcnt + vmcnt(0) and + s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt vmcnt(0) + must happen after + any preceding + global/generic + load/store/load + atomic/store + atomic/atomicrmw. + - s_waitcnt lgkmcnt(0) + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following buffer_invl2 and + buffer_wbinvl1_vol. 
+ - Ensures that the + preceding + global/local/generic + load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + acquire-fence-paired-atomic) + has completed + before invalidating + the cache. This + satisfies the + requirements of + acquire. + - Ensures that all + previous memory + operations have + completed before a + following + global/local/generic + store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + release-fence-paired-atomic). + This satisfies the + requirements of + release. + + 3. buffer_invl2; + buffer_wbinvl1_vol + + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + MTYPE NC global data. + MTYPE RW and CC memory will + never be stale due to the + memory probes. + + **Sequential Consistent Atomic** + ------------------------------------------------------------------------------------ + load atomic seq_cst - singlethread - global *Same as corresponding + - wavefront - local load atomic acquire, + - generic except must generate + all instructions even + for OpenCL.* + load atomic seq_cst - workgroup - global 1. s_waitcnt lgkm/vmcnt(0) + - generic + - Use lgkmcnt(0) if not + TgSplit execution mode + and vmcnt(0) if TgSplit + execution mode. + - s_waitcnt lgkmcnt(0) must + happen after + preceding + local/generic load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own s_waitcnt + lgkmcnt(0) and so do + not need to be + considered.) + - s_waitcnt vmcnt(0) + must happen after + preceding + global/generic load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope.
+ (Note that seq_cst + fences have their + own s_waitcnt + vmcnt(0) and so do + not need to be + considered.) + - Ensures any + preceding + sequential + consistent global/local + memory instructions + have completed + before executing + this sequentially + consistent + instruction. This + prevents reordering + a seq_cst store + followed by a + seq_cst load. (Note + that seq_cst is + stronger than + acquire/release as + the reordering of + load acquire + followed by a store + release is + prevented by the + s_waitcnt of + the release, but + there is nothing + preventing a store + release followed by + load acquire from + completing out of + order. The s_waitcnt + could be placed after + seq_store or before + the seq_load. We + choose the load to + make the s_waitcnt be + as late as possible + so that the store + may have already + completed.) + + 2. *Following + instructions same as + corresponding load + atomic acquire, + except must generate + all instructions even + for OpenCL.* + load atomic seq_cst - workgroup - local *If TgSplit execution mode, + local address space cannot + be used.* + + *Same as corresponding + load atomic acquire, + except must generate + all instructions even + for OpenCL.* + + load atomic seq_cst - agent - global 1. s_waitcnt lgkmcnt(0) & + - system - generic vmcnt(0) + + - If TgSplit execution mode, + omit lgkmcnt(0). + - Could be split into + separate s_waitcnt + vmcnt(0) + and s_waitcnt + lgkmcnt(0) to allow + them to be + independently moved + according to the + following rules. + - s_waitcnt lgkmcnt(0) + must happen after + preceding + global/generic load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own s_waitcnt + lgkmcnt(0) and so do + not need to be + considered.)
+ - s_waitcnt vmcnt(0) + must happen after + preceding + global/generic load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own s_waitcnt + vmcnt(0) and so do + not need to be + considered.) + - Ensures any + preceding + sequential + consistent global + memory instructions + have completed + before executing + this sequentially + consistent + instruction. This + prevents reordering + a seq_cst store + followed by a + seq_cst load. (Note + that seq_cst is + stronger than + acquire/release as + the reordering of + load acquire + followed by a store + release is + prevented by the + s_waitcnt of + the release, but + there is nothing + preventing a store + release followed by + load acquire from + completing out of + order. The s_waitcnt + could be placed after + seq_store or before + the seq_load. We + choose the load to + make the s_waitcnt be + as late as possible + so that the store + may have already + completed.) + + 2. *Following + instructions same as + corresponding load + atomic acquire, + except must generate + all instructions even + for OpenCL.* + store atomic seq_cst - singlethread - global *Same as corresponding + - wavefront - local store atomic release, + - workgroup - generic except must generate + - agent all instructions even + - system for OpenCL.* + atomicrmw seq_cst - singlethread - global *Same as corresponding + - wavefront - local atomicrmw acq_rel, + - workgroup - generic except must generate + - agent all instructions even + - system for OpenCL.* + fence seq_cst - singlethread *none* *Same as corresponding + - wavefront fence acq_rel, + - workgroup except must generate + - agent all instructions even + - system for OpenCL.* + ============ ============ ============== ========== ================================ + +..
_amdgpu-amdhsa-memory-model-gfx10: + +Memory Model GFX10 +++++++++++++++++++ + +For GFX10: + +* Each agent has multiple shader arrays (SA). +* Each SA has multiple work-group processors (WGP). +* Each WGP has multiple compute units (CU). +* Each CU has multiple SIMDs that execute wavefronts. +* The wavefronts for a single work-group are executed in the same + WGP. In CU wavefront execution mode the wavefronts may be executed by + different SIMDs in the same CU. In WGP wavefront execution mode the + wavefronts may be executed by different SIMDs in different CUs in the same + WGP. +* Each WGP has a single LDS memory shared by the wavefronts of the work-groups + executing on it. +* All LDS operations of a WGP are performed as wavefront wide operations in a + global order and involve no caching. Completion is reported to a wavefront in + execution order. +* The LDS memory has multiple request queues shared by the SIMDs of a + WGP. Therefore, the LDS operations performed by different wavefronts of a + work-group can be reordered relative to each other, which can result in + reordering the visibility of vector memory operations with respect to LDS + operations of other wavefronts in the same work-group. A ``s_waitcnt + lgkmcnt(0)`` is required to ensure synchronization between LDS operations and + vector memory operations between wavefronts of a work-group, but not between + operations performed by the same wavefront. +* The vector memory operations are performed as wavefront wide operations. + Completion of load/store/sample operations is reported to a wavefront in + execution order of other load/store/sample operations performed by that + wavefront. +* The vector memory operations access a vector L0 cache. There is a single L0 + cache per CU. Each SIMD of a CU accesses the same L0 cache. Therefore, no + special action is required for coherence between the lanes of a single + wavefront.
However, a ``buffer_gl0_inv`` is required for coherence between + wavefronts executing in the same work-group as they may be executing on SIMDs + of different CUs that access different L0s. A ``buffer_gl0_inv`` is also + required for coherence between wavefronts executing in different work-groups + as they may be executing on different WGPs. +* The scalar memory operations access a scalar L0 cache shared by all wavefronts + on a WGP. The scalar and vector L0 caches are not coherent. However, scalar + operations are used in a restricted way so do not impact the memory model. See + :ref:`amdgpu-amdhsa-memory-spaces`. +* The vector and scalar memory L0 caches use an L1 cache shared by all WGPs on + the same SA. Therefore, no special action is required for coherence between + the wavefronts of a single work-group. However, a ``buffer_gl1_inv`` is + required for coherence between wavefronts executing in different work-groups + as they may be executing on different SAs that access different L1s. +* The L1 caches have independent quadrants to service disjoint ranges of virtual + addresses. +* Each L0 cache has a separate request queue per L1 quadrant. Therefore, the + vector and scalar memory operations performed by different wavefronts, whether + executing in the same or different work-groups (which may be executing on + different CUs accessing different L0s), can be reordered relative to each + other. A ``s_waitcnt vmcnt(0) & vscnt(0)`` is required to ensure + synchronization between vector memory operations of different wavefronts. It + ensures a previous vector memory operation has completed before executing a + subsequent vector memory or LDS operation and so can be used to meet the + requirements of acquire, release and sequential consistency. +* The L1 caches use an L2 cache shared by all SAs on the same agent. +* The L2 cache has independent channels to service disjoint ranges of virtual + addresses. 
+* Each L1 quadrant of a single SA accesses a different L2 channel. Each L1 + quadrant has a separate request queue per L2 channel. Therefore, the vector + and scalar memory operations performed by wavefronts executing in different + work-groups (which may be executing on different SAs) of an agent can be + reordered relative to each other. A ``s_waitcnt vmcnt(0) & vscnt(0)`` is + required to ensure synchronization between vector memory operations of + different SAs. It ensures a previous vector memory operation has completed + before executing a subsequent vector memory operation and so can be used to meet the + requirements of acquire, release and sequential consistency. +* The L2 cache can be kept coherent with other agents on some targets, or ranges + of virtual addresses can be set up to bypass it to ensure system coherence. + +Scalar memory operations are only used to access memory that is proven to not +change during the execution of the kernel dispatch. This includes constant +address space and global address space for program scope ``const`` variables. +Therefore, the kernel machine code does not have to maintain the scalar cache to +ensure it is coherent with the vector caches. The scalar and vector caches are +invalidated between kernel dispatches by CP since constant address space data +may change between kernel dispatch executions. See +:ref:`amdgpu-amdhsa-memory-spaces`. + +The one exception is if scalar writes are used to spill SGPR registers. In this +case the AMDGPU backend ensures the memory location used to spill is never +accessed by vector memory operations at the same time. If scalar writes are used +then a ``s_dcache_wb`` is inserted before the ``s_endpgm`` and before a function +return since the locations may be used for vector memory instructions by a +future wavefront that uses the same scratch area, or a function call that +creates a frame at the same address, respectively.
There is no need for a +``s_dcache_inv`` as all scalar writes are write-before-read in the same thread. + +For kernarg backing memory: + +* CP invalidates the L0 and L1 caches at the start of each kernel dispatch. +* On dGPU the kernarg backing memory is accessed as MTYPE UC (uncached) to avoid + needing to invalidate the L2 cache. +* On APU the kernarg backing memory is accessed as MTYPE CC (cache coherent) and + so the L2 cache will be coherent with the CPU and other agents. + +Scratch backing memory (which is used for the private address space) is accessed +with MTYPE NC (non-coherent). Since the private address space is only accessed +by a single thread, and is always write-before-read, there is never a need to +invalidate these entries from the L0 or L1 caches. + +Wavefronts are executed in native mode with in-order reporting of loads and +sample instructions. In this mode vmcnt reports completion of load, atomic with +return and sample instructions in order, and the vscnt reports the completion of +store and atomic without return in order. See ``MEM_ORDERED`` field in +:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`. + +Wavefronts can be executed in WGP or CU wavefront execution mode: + +* In WGP wavefront execution mode the wavefronts of a work-group are executed + on the SIMDs of both CUs of the WGP. Therefore, explicit management of the per + CU L0 caches is required for work-group synchronization. Also accesses to L1 + at work-group scope need to be explicitly ordered as the accesses from + different CUs are not ordered. +* In CU wavefront execution mode the wavefronts of a work-group are executed on + the SIMDs of a single CU of the WGP. Therefore, all global memory access by + the work-group access the same L0 which in turn ensures L1 accesses are + ordered and so do not require explicit management of the caches for + work-group synchronization. 
+ +See ``WGP_MODE`` field in +:ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table` and +:ref:`amdgpu-target-features`. + +The code sequences used to implement the memory model for GFX10 are defined in +table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`. + + .. table:: AMDHSA Memory Model Code Sequences GFX10 + :name: amdgpu-amdhsa-memory-model-code-sequences-gfx10-table + + ============ ============ ============== ========== ================================ + LLVM Instr LLVM Memory LLVM Memory AMDGPU AMDGPU Machine Code + Ordering Sync Scope Address GFX10 + Space + ============ ============ ============== ========== ================================ + **Non-Atomic** + ------------------------------------------------------------------------------------ + load *none* *none* - global - !volatile & !nontemporal + - generic + - private 1. buffer/global/flat_load + - constant + - !volatile & nontemporal + + 1. buffer/global/flat_load + slc=1 + + - volatile + + 1. buffer/global/flat_load + glc=1 dlc=1 + 2. s_waitcnt vmcnt(0) + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + load *none* *none* - local 1. ds_load + store *none* *none* - global - !volatile & !nontemporal + - generic + - private 1. buffer/global/flat_store + - constant + - !volatile & nontemporal + + 1. buffer/global/flat_store + slc=1 + + - volatile + + 1. buffer/global/flat_store + 2. s_waitcnt vscnt(0) + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + store *none* *none* - local 1. ds_store + **Unordered Atomic** + ------------------------------------------------------------------------------------ + load atomic unordered *any* *any* *Same as non-atomic*. 
+ store atomic unordered *any* *any* *Same as non-atomic*. + atomicrmw unordered *any* *any* *Same as monotonic atomic*. + **Monotonic Atomic** + ------------------------------------------------------------------------------------ + load atomic monotonic - singlethread - global 1. buffer/global/flat_load + - wavefront - generic + load atomic monotonic - workgroup - global 1. buffer/global/flat_load + - generic glc=1 + + - If CU wavefront execution + mode, omit glc=1. + + load atomic monotonic - singlethread - local 1. ds_load + - wavefront + - workgroup + load atomic monotonic - agent - global 1. buffer/global/flat_load + - system - generic glc=1 dlc=1 + store atomic monotonic - singlethread - global 1. buffer/global/flat_store + - wavefront - generic + - workgroup + - agent + - system + store atomic monotonic - singlethread - local 1. ds_store + - wavefront + - workgroup + atomicrmw monotonic - singlethread - global 1. buffer/global/flat_atomic + - wavefront - generic + - workgroup + - agent + - system + atomicrmw monotonic - singlethread - local 1. ds_atomic + - wavefront + - workgroup + **Acquire Atomic** + ------------------------------------------------------------------------------------ + load atomic acquire - singlethread - global 1. buffer/global/ds/flat_load + - wavefront - local + - generic + load atomic acquire - workgroup - global 1. buffer/global_load glc=1 + + - If CU wavefront execution + mode, omit glc=1. + + 2. s_waitcnt vmcnt(0) + + - If CU wavefront execution + mode, omit. + - Must happen before + the following buffer_gl0_inv + and before any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + + 3. buffer_gl0_inv + + - If CU wavefront execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + load atomic acquire - workgroup - local 1. ds_load + 2. s_waitcnt lgkmcnt(0) + + - If OpenCL, omit. 
+ - Must happen before + the following buffer_gl0_inv + and before any following + global/generic load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than the local load + atomic value being + acquired. + + 3. buffer_gl0_inv + + - If CU wavefront execution + mode, omit. + - If OpenCL, omit. + - Ensures that + following + loads will not see + stale data. + + load atomic acquire - workgroup - generic 1. flat_load glc=1 + + - If CU wavefront execution + mode, omit glc=1. + + 2. s_waitcnt lgkmcnt(0) & + vmcnt(0) + + - If CU wavefront execution + mode, omit vmcnt(0). + - If OpenCL, omit + lgkmcnt(0). + - Must happen before + the following + buffer_gl0_inv and any + following global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than a local load + atomic value being + acquired. + + 3. buffer_gl0_inv + + - If CU wavefront execution + mode, omit. + - Ensures that + following + loads will not see + stale data. + + load atomic acquire - agent - global 1. buffer/global_load + - system glc=1 dlc=1 + 2. s_waitcnt vmcnt(0) + + - Must happen before + following + buffer_gl*_inv. + - Ensures the load + has completed + before invalidating + the caches. + + 3. buffer_gl0_inv; + buffer_gl1_inv + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale global data. + + load atomic acquire - agent - generic 1. flat_load glc=1 dlc=1 + - system 2. s_waitcnt vmcnt(0) & + lgkmcnt(0) + + - If OpenCL omit + lgkmcnt(0). + - Must happen before + following + buffer_gl*_inv. + - Ensures the flat_load + has completed + before invalidating + the caches. + + 3. buffer_gl0_inv; + buffer_gl1_inv + + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data.
+ + atomicrmw acquire - singlethread - global 1. buffer/global/ds/flat_atomic + - wavefront - local - generic atomicrmw acquire - workgroup - global 1. buffer/global_atomic 2. s_waitcnt vm/vscnt(0) @@ -9099,6 +11549,8 @@ - *kernarg_segment_alignment*, *group_segment_alignment*, and *private_segment_alignment* default to 4. Note that alignments are specified as a power of 2, so a value of **n** means an alignment of 2^ **n**. +- *enable_tg_split* defaults to 1 if target feature ``tgsplit`` is enabled for + GFX90A onwards. - *enable_wgp_mode* defaults to 1 if target feature ``cumode`` is disabled for GFX10 onwards. - *enable_mem_ordered* defaults to 1 for GFX10 onwards. @@ -9312,6 +11764,9 @@ ``.amdhsa_next_free_sgpr`` Required GFX6-GFX10 Maximum SGPR number explicitly referenced, plus one. Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`. + ``.amdhsa_accum_offset`` Required GFX90A Offset of a first AccVGPR in the unified register file. + Used to calculate ACCUM_OFFSET in + :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. ``.amdhsa_reserve_vcc`` 1 GFX6-GFX10 Whether the kernel may use the special VCC SGPR. Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`. @@ -9345,6 +11800,10 @@ :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`. ``.amdhsa_fp16_overflow`` 0 GFX9-GFX10 Controls FP16_OVFL in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx10-table`. + ``.amdhsa_tg_split`` Target GFX90A Controls TG_SPLIT in + Feature :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. + Specific + (tgsplit) ``.amdhsa_workgroup_processor_mode`` Target GFX10 Controls ENABLE_WGP_MODE in Feature :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. 
Specific Index: llvm/include/llvm/BinaryFormat/ELF.h =================================================================== --- llvm/include/llvm/BinaryFormat/ELF.h +++ llvm/include/llvm/BinaryFormat/ELF.h @@ -719,6 +719,7 @@ EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030, EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, + EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032, EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034, @@ -733,7 +734,7 @@ // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX805, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1029,6 +1029,10 @@ // gfx908 intrinsic def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic; +// gfx90a intrinsics +def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic; + class AMDGPUStructBufferAtomic : Intrinsic < !if(NoRtn, [], [data_ty]), [!if(NoRtn, data_ty, LLVMMatchType<0>), // vdata(VGPR) @@ -1066,6 +1070,10 @@ // gfx908 intrinsic def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic; +// gfx90a intrinsics +def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic; + // Obsolescent tbuffer intrinsics. 
def int_amdgcn_tbuffer_load : Intrinsic < @@ -1995,6 +2003,65 @@ [IntrConvergent, IntrNoMem, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>]>; +//===----------------------------------------------------------------------===// +// gfx90a intrinsics +// ===----------------------------------------------------------------------===// + +def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn; +def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn; +def int_amdgcn_flat_atomic_fadd : AMDGPUGlobalAtomicRtn; +def int_amdgcn_flat_atomic_fmin : AMDGPUGlobalAtomicRtn; +def int_amdgcn_flat_atomic_fmax : AMDGPUGlobalAtomicRtn; + +def int_amdgcn_mfma_f32_32x32x4bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x4bf16_1k">, + Intrinsic<[llvm_v32f32_ty], + [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v32f32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrNoMem, + ImmArg>, ImmArg>, ImmArg>]>; + +def int_amdgcn_mfma_f32_16x16x4bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x4bf16_1k">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v16f32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrNoMem, + ImmArg>, ImmArg>, ImmArg>]>; + +def int_amdgcn_mfma_f32_4x4x4bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_4x4x4bf16_1k">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v4f32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrNoMem, + ImmArg>, ImmArg>, ImmArg>]>; + +def int_amdgcn_mfma_f32_32x32x8bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_32x32x8bf16_1k">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v16f32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrNoMem, + ImmArg>, ImmArg>, ImmArg>]>; + +def int_amdgcn_mfma_f32_16x16x16bf16_1k : GCCBuiltin<"__builtin_amdgcn_mfma_f32_16x16x16bf16_1k">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4i16_ty, llvm_v4i16_ty, llvm_v4f32_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrNoMem, + 
ImmArg>, ImmArg>, ImmArg>]>; + +def int_amdgcn_mfma_f64_16x16x4f64 : GCCBuiltin<"__builtin_amdgcn_mfma_f64_16x16x4f64">, + Intrinsic<[llvm_v4f64_ty], + [llvm_double_ty, llvm_double_ty, llvm_v4f64_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrNoMem, + ImmArg>, ImmArg>, ImmArg>]>; + +def int_amdgcn_mfma_f64_4x4x4f64 : GCCBuiltin<"__builtin_amdgcn_mfma_f64_4x4x4f64">, + Intrinsic<[llvm_double_ty], + [llvm_double_ty, llvm_double_ty, llvm_double_ty, + llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrNoMem, + ImmArg>, ImmArg>, ImmArg>]>; + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. Index: llvm/include/llvm/Support/AMDHSAKernelDescriptor.h =================================================================== --- llvm/include/llvm/Support/AMDHSAKernelDescriptor.h +++ llvm/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -122,14 +122,27 @@ }; #undef COMPUTE_PGM_RSRC2 -// Compute program resource register 3. Must match hardware definition. -#define COMPUTE_PGM_RSRC3(NAME, SHIFT, WIDTH) \ - AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_ ## NAME, SHIFT, WIDTH) +// Compute program resource register 3 for GFX90A+. Must match hardware +// definition. +#define COMPUTE_PGM_RSRC3_GFX90A(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX90A_ ## NAME, SHIFT, WIDTH) enum : int32_t { - COMPUTE_PGM_RSRC3(SHARED_VGPR_COUNT, 0, 4), // GFX10+ - COMPUTE_PGM_RSRC3(RESERVED0, 4, 28), + COMPUTE_PGM_RSRC3_GFX90A(ACCUM_OFFSET, 0, 6), + COMPUTE_PGM_RSRC3_GFX90A(RESERVED0, 6, 10), + COMPUTE_PGM_RSRC3_GFX90A(TG_SPLIT, 16, 1), + COMPUTE_PGM_RSRC3_GFX90A(RESERVED1, 17, 15), }; -#undef COMPUTE_PGM_RSRC3 +#undef COMPUTE_PGM_RSRC3_GFX90A + +// Compute program resource register 3 for GFX10+. Must match hardware +// definition. 
+#define COMPUTE_PGM_RSRC3_GFX10(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_GFX10_ ## NAME, SHIFT, WIDTH) +enum : int32_t { + COMPUTE_PGM_RSRC3_GFX10(SHARED_VGPR_COUNT, 0, 4), // GFX10+ + COMPUTE_PGM_RSRC3_GFX10(RESERVED0, 4, 28), +}; +#undef COMPUTE_PGM_RSRC3_GFX10 // Kernel code properties. Must be kept backwards compatible. #define KERNEL_CODE_PROPERTY(NAME, SHIFT, WIDTH) \ @@ -155,7 +168,7 @@ uint8_t reserved0[8]; int64_t kernel_code_entry_byte_offset; uint8_t reserved1[20]; - uint32_t compute_pgm_rsrc3; // GFX10+ + uint32_t compute_pgm_rsrc3; // GFX10+ and GFX90A+ uint32_t compute_pgm_rsrc1; uint32_t compute_pgm_rsrc2; uint16_t kernel_code_properties; Index: llvm/include/llvm/Support/TargetParser.h =================================================================== --- llvm/include/llvm/Support/TargetParser.h +++ llvm/include/llvm/Support/TargetParser.h @@ -83,7 +83,8 @@ GK_GFX906 = 63, GK_GFX908 = 64, GK_GFX909 = 65, - GK_GFX90C = 66, + GK_GFX90A = 66, + GK_GFX90C = 67, GK_GFX1010 = 71, GK_GFX1011 = 72, Index: llvm/lib/Object/ELFObjectFile.cpp =================================================================== --- llvm/lib/Object/ELFObjectFile.cpp +++ llvm/lib/Object/ELFObjectFile.cpp @@ -457,6 +457,8 @@ return "gfx908"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: return "gfx909"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: + return "gfx90a"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: return "gfx90c"; Index: llvm/lib/ObjectYAML/ELFYAML.cpp =================================================================== --- llvm/lib/ObjectYAML/ELFYAML.cpp +++ llvm/lib/ObjectYAML/ELFYAML.cpp @@ -535,6 +535,7 @@ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX908, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90A, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90C, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); 
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH); Index: llvm/lib/Support/TargetParser.cpp =================================================================== --- llvm/lib/Support/TargetParser.cpp +++ llvm/lib/Support/TargetParser.cpp @@ -104,6 +104,7 @@ {{"gfx906"}, {"gfx906"}, GK_GFX906, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx908"}, {"gfx908"}, GK_GFX908, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx909"}, {"gfx909"}, GK_GFX909, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, + {{"gfx90a"}, {"gfx90a"}, GK_GFX90A, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx90c"}, {"gfx90c"}, GK_GFX90C, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK}, {{"gfx1011"}, {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK}, @@ -213,6 +214,7 @@ case GK_GFX906: return {9, 0, 6}; case GK_GFX908: return {9, 0, 8}; case GK_GFX909: return {9, 0, 9}; + case GK_GFX90A: return {9, 0, 10}; case GK_GFX90C: return {9, 0, 12}; case GK_GFX1010: return {10, 1, 0}; case GK_GFX1011: return {10, 1, 1}; Index: llvm/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.td +++ llvm/lib/Target/AMDGPU/AMDGPU.td @@ -51,6 +51,12 @@ "Most fp64 instructions are half rate instead of quarter" >; +def FullRate64Ops : SubtargetFeature<"full-rate-64-ops", + "FullRate64Ops", + "true", + "Most fp64 instructions are full rate" +>; + def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "FlatAddressSpace", "true", @@ -148,6 +154,12 @@ "Enable XNACK support" >; +def FeatureTgSplit : SubtargetFeature<"tgsplit", + "EnableTgSplit", + "true", + "Enable threadgroup split execution" +>; + def FeatureCuMode : 
SubtargetFeature<"cumode", "EnableCuMode", "true", @@ -272,6 +284,12 @@ "Additional instructions for GFX9+" >; +def FeatureGFX90AInsts : SubtargetFeature<"gfx90a-insts", + "GFX90AInsts", + "true", + "Additional instructions for GFX90A+" +>; + def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", "GFX10Insts", "true", @@ -387,6 +405,18 @@ "Support DPP8 (Data Parallel Primitives) extension" >; +def Feature64BitDPP : SubtargetFeature<"dpp-64bit", + "Has64BitDPP", + "true", + "Support DPP (Data Parallel Primitives) extension" +>; + +def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops", + "HasPackedFP32Ops", + "true", + "Support packed fp32 instructions" +>; + def FeatureR128A16 : SubtargetFeature<"r128-a16", "HasR128A16", "true", @@ -411,6 +441,12 @@ "Support NSA encoding for image instructions" >; +def FeatureExtendedImageInsts : SubtargetFeature<"extended-image-insts", + "HasExtendedImageInsts", + "true", + "Support mips != 0, lod != 0, gather4, and get_lod" +>; + def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding", "GFX10_BEncoding", "true", @@ -659,6 +695,12 @@ " supports it" >; +def FeaturePackedTID : SubtargetFeature<"packed-tid", + "HasPackedTID", + "true", + "Workitem IDs are packed into v0 at kernel launch" +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -675,7 +717,8 @@ [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel, - FeatureTrigReducedRange] + FeatureTrigReducedRange, FeatureExtendedImageInsts + ] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", @@ -684,7 +727,8 @@ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureUnalignedBufferAccess] + FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess + ] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", @@ -697,7 +741,9 @@ FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, - FeatureDsSrc2Insts, FeatureFastDenormalF32, FeatureUnalignedBufferAccess] + FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32, + FeatureUnalignedBufferAccess + ] >; def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", @@ -712,9 +758,9 @@ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, - FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, - FeatureFastDenormalF32, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureSupportsXNACK] + FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess + ] >; def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", @@ -729,7 +775,7 @@ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, 
FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, - FeatureVOP3Literal, FeatureDPP8, + FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16, FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess @@ -816,17 +862,26 @@ [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_2 : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; def FeatureISAVersion9_0_4 : FeatureSet< [FeatureGFX9, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureFmaMixInsts, FeatureImageGather4D16Bug]>; @@ -835,6 +890,9 @@ HalfRate64Ops, FeatureFmaMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, @@ -846,6 +904,9 @@ HalfRate64Ops, FeatureFmaMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureDLInsts, FeatureDot1Insts, FeatureDot2Insts, @@ -864,13 +925,40 @@ [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureMadMacF32Insts, FeatureImageGather4D16Bug]>; +def FeatureISAVersion9_0_A : FeatureSet< + [FeatureGFX9, + FeatureGFX90AInsts, + FeatureFmaMixInsts, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot3Insts, + FeatureDot4Insts, + FeatureDot5Insts, + FeatureDot6Insts, + Feature64BitDPP, + FeaturePackedFP32Ops, + FeatureMAIInsts, + FeaturePkFmacF16Inst, + FeatureAtomicFaddInsts, + FeatureMadMacF32Insts, + FeatureSupportsSRAMECC, + 
FeaturePackedTID, + FullRate64Ops]>; + def FeatureISAVersion9_0_C : FeatureSet< [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureXNACK, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, FeatureImageGather4D16Bug]>; // TODO: Organize more features into groups. @@ -1077,6 +1165,14 @@ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of (not FeatureGFX10Insts))>; +def isGFX6GFX7GFX8GFX9NotGFX90A : + Predicate<"!Subtarget->hasGFX90AInsts() &&" + "(Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, + AssemblerPredicate<(all_of (not FeatureGFX10Insts), (not FeatureGFX90AInsts))>; + def isGFX7Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<(all_of FeatureCIInsts)>; @@ -1097,6 +1193,28 @@ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>; +def isGCN3ExcludingGFX90A : + Predicate<"Subtarget->isGCN3Encoding() && !Subtarget->hasGFX90AInsts()">, + AssemblerPredicate<(all_of FeatureGCN3Encoding, (not FeatureGFX90AInsts))>; + +def isGFX90APlus : + Predicate<"Subtarget->hasGFX90AInsts()">, + AssemblerPredicate<(all_of FeatureGFX90AInsts)>; + +def isNotGFX90APlus : + Predicate<"!Subtarget->hasGFX90AInsts()">, + AssemblerPredicate<(all_of (not FeatureGFX90AInsts))>; + +def isGFX8GFX9NotGFX90A : + Predicate<"!Subtarget->hasGFX90AInsts() &&" + "(Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + " Subtarget->getGeneration() == AMDGPUSubtarget::GFX9)">, + AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding, (not FeatureGFX90AInsts))>; + +def isGFX90AOnly : + Predicate<"Subtarget->hasGFX90AInsts()">, + AssemblerPredicate<(all_of FeatureGFX90AInsts)>; + def isGFX8GFX9 : 
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, @@ -1177,6 +1295,15 @@ def HasDPP8 : Predicate<"Subtarget->hasDPP8()">, AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>; +def Has64BitDPP : Predicate<"Subtarget->has64BitDPP()">, + AssemblerPredicate<(all_of Feature64BitDPP)>; + +def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">, + AssemblerPredicate<(all_of FeaturePackedFP32Ops)>; + +def HasExtendedImageInsts : Predicate<"Subtarget->hasExtendedImageInsts()">, + AssemblerPredicate<(all_of FeatureExtendedImageInsts)>; + def HasR128A16 : Predicate<"Subtarget->hasR128A16()">, AssemblerPredicate<(all_of FeatureR128A16)>; Index: llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -143,7 +143,8 @@ // Input registers for non-HSA ABI ArgDescriptor ImplicitBufferPtr; - // VGPRs inputs. These are always v0, v1 and v2 for entry functions. + // VGPRs inputs. For entry functions these are either v0, v1 and v2 or packed + // into v0, 10 bits per dimension if packed-tid is set. ArgDescriptor WorkItemIDX; ArgDescriptor WorkItemIDY; ArgDescriptor WorkItemIDZ; Index: llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -328,11 +328,11 @@ // causing stale data in caches. Arguably this should be done by the linker, // which is why this isn't done for Mesa. 
const MCSubtargetInfo &STI = *getGlobalSTI(); - if (AMDGPU::isGFX10Plus(STI) && + if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && (STI.getTargetTriple().getOS() == Triple::AMDHSA || STI.getTargetTriple().getOS() == Triple::AMDPAL)) { OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - getTargetStreamer()->EmitCodeEnd(); + getTargetStreamer()->EmitCodeEnd(STI); } return AsmPrinter::doFinalization(M); @@ -400,6 +400,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( const MachineFunction &MF, const SIProgramInfo &PI) const { + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); amdhsa::kernel_descriptor_t KernelDescriptor; memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); @@ -413,6 +414,11 @@ KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2; KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); + assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); + if (STM.hasGFX90AInsts()) + KernelDescriptor.compute_pgm_rsrc3 = + CurrentProgramInfo.ComputePGMRSrc3GFX90A; + return KernelDescriptor; } @@ -521,6 +527,11 @@ " NumVGPRsForWavesPerEU: " + Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); + if (STM.hasGFX90AInsts()) + OutStreamer->emitRawComment( + " AccumOffset: " + + Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false); + OutStreamer->emitRawComment( " Occupancy: " + Twine(CurrentProgramInfo.Occupancy), false); @@ -550,6 +561,21 @@ " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), false); + + assert(STM.hasGFX90AInsts() || + CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); + if (STM.hasGFX90AInsts()) { + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + + Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))), + false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + +
Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))), + false); + } } if (DumpCodeInstEmitter) { @@ -612,6 +638,8 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs( const GCNSubtarget &ST) const { + if (ST.hasGFX90AInsts() && NumAGPR) + return alignTo(NumVGPR, 4) + NumAGPR; return std::max(NumVGPR, NumAGPR); } @@ -985,6 +1013,8 @@ ProgInfo.NumArchVGPR = Info.NumVGPR; ProgInfo.NumAccVGPR = Info.NumAGPR; ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM); + ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1; + ProgInfo.TgSplit = STM.isTgSplitEnabled(); ProgInfo.NumSGPR = Info.NumExplicitSGPR; ProgInfo.ScratchSize = Info.PrivateSegmentSize; ProgInfo.VCCUsed = Info.UsesVCC; @@ -1163,6 +1193,15 @@ S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | S_00B84C_EXCP_EN(0); + if (STM.hasGFX90AInsts()) { + AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, + ProgInfo.AccumOffset); + AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, + ProgInfo.TgSplit); + } + ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize, ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU); Index: llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -163,6 +163,10 @@ (sequence "VGPR%u", 248, 255)) >; +def CSR_AMDGPU_AGPRs_32_255 : CalleeSavedRegs< + (sequence "AGPR%u", 32, 255) +>; + def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs< (sequence "SGPR%u", 32, 105) >; @@ -172,6 +176,13 @@ (sequence "VGPR%u", 0, 255) >; +def CSR_AMDGPU_AllAGPRs : CalleeSavedRegs< + (sequence "AGPR%u", 0, 255) +>; +def CSR_AMDGPU_AllVectorRegs : CalleeSavedRegs< + (add CSR_AMDGPU_AllVGPRs, CSR_AMDGPU_AllAGPRs) +>; + // Just to get the regmask, not 
for calling convention purposes. def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs< (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI) @@ -181,6 +192,10 @@ (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105) >; +def CSR_AMDGPU_HighRegs_With_AGPRs : CalleeSavedRegs< + (add CSR_AMDGPU_HighRegs, CSR_AMDGPU_AGPRs_32_255) +>; + def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>; // Calling convention for leaf functions Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -219,6 +219,8 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; @@ -338,5 +340,8 @@ def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">, GISDNodeXFormEquiv; +def gi_extract_sccb : GICustomOperandRenderer<"renderExtractSCCB">, + GISDNodeXFormEquiv; + def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">, GISDNodeXFormEquiv; Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -189,11 +189,12 @@ bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ, + SDValue &SCCB) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE, SDValue &DLC, - SDValue &SWZ) const; + SDValue &SWZ, SDValue &SCCB) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; @@ -206,7 +207,8 @@ bool 
SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ, + SDValue &SCCB) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, @@ -322,6 +324,16 @@ // Figure out if this is really an extract of the high 16-bits of a dword. static bool isExtractHiElt(SDValue In, SDValue &Out) { In = stripBitcast(In); + + if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) { + if (!Idx->isOne()) + return false; + Out = In.getOperand(0); + return true; + } + } + if (In.getOpcode() != ISD::TRUNCATE) return false; @@ -341,6 +353,13 @@ // Look through operations that obscure just looking at the low 16-bits of the // same register. static SDValue stripExtractLoElt(SDValue In) { + if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) { + if (Idx->isNullValue() && In.getValueSizeInBits() <= 32) + return In.getOperand(0); + } + } + if (In.getOpcode() == ISD::TRUNCATE) { SDValue Src = In.getOperand(0); if (Src.getValueType().getSizeInBits() == 32) @@ -1380,7 +1399,7 @@ SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE, SDValue &DLC, - SDValue &SWZ) const { + SDValue &SWZ, SDValue &SCCB) const { // Subtarget prefers to use flat instruction // FIXME: This should be a pattern predicate and not reach here if (Subtarget->useFlatForGlobal()) @@ -1395,6 +1414,7 @@ TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); + SCCB = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1474,7 +1494,8 @@ SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE, - SDValue &DLC, SDValue &SWZ) const { + SDValue &DLC, SDValue &SWZ, + SDValue &SCCB) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. @@ -1483,7 +1504,7 @@ return false; if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC, SWZ)) + GLC, SLC, TFE, DLC, SWZ, SCCB)) return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); @@ -1505,9 +1526,9 @@ SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE, DLC, SWZ; + SDValue GLC, TFE, DLC, SWZ, SCCB; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ, SCCB); } static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { @@ -1631,13 +1652,13 @@ SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE, SDValue &DLC, - SDValue &SWZ) const { + SDValue &SWZ, SDValue &SCCB) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC, SWZ)) + GLC, SLC, TFE, DLC, SWZ, SCCB)) return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && @@ -1659,16 +1680,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset ) const { - SDValue GLC, SLC, TFE, DLC, SWZ; + SDValue GLC, SLC, TFE, DLC, SWZ, SCCB; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ, SCCB); } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const { - SDValue GLC, TFE, DLC, SWZ; + SDValue GLC, TFE, DLC, SWZ, SCCB; - return
SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ, SCCB); } // Find a load or store from corresponding pattern root. @@ -2773,18 +2794,62 @@ if (isExtractHiElt(Hi, Hi)) Mods |= SISrcMods::OP_SEL_1; + unsigned VecSize = Src.getValueSizeInBits(); Lo = stripExtractLoElt(Lo); Hi = stripExtractLoElt(Hi); + if (Lo.getValueSizeInBits() > VecSize) { + Lo = CurDAG->getTargetExtractSubreg( + (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In), + MVT::getIntegerVT(VecSize), Lo); + } + + if (Hi.getValueSizeInBits() > VecSize) { + Hi = CurDAG->getTargetExtractSubreg( + (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In), + MVT::getIntegerVT(VecSize), Hi); + } + + assert(Lo.getValueSizeInBits() <= VecSize && + Hi.getValueSizeInBits() <= VecSize); + if (Lo == Hi && !isInlineImmediate(Lo.getNode())) { // Really a scalar input. Just select from the low half of the register to // avoid packing. - Src = Lo; + if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) { + Src = Lo; + } else { + assert(Lo.getValueSizeInBits() == 32 && VecSize == 64); + + SDLoc SL(In); + SDValue Undef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, + Lo.getValueType()), 0); + auto RC = Lo->isDivergent() ? 
AMDGPU::VReg_64RegClassID + : AMDGPU::SReg_64RegClassID; + const SDValue Ops[] = { + CurDAG->getTargetConstant(RC, SL, MVT::i32), + Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), + Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) }; + + Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL, + Src.getValueType(), Ops), 0); + } SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } + if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) { + uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF() + .bitcastToAPInt().getZExtValue(); + if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) { + Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + } + Mods = VecMods; } Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -523,6 +523,8 @@ BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_CSUB, BUFFER_ATOMIC_FADD, + BUFFER_ATOMIC_FMIN, + BUFFER_ATOMIC_FMAX, LAST_AMDGPU_ISD_NUMBER }; Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4345,6 +4345,8 @@ NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) + NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) + NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -291,6 +291,9 @@ int OpIdx) const; void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr
&MI, int OpIdx) const; + void renderExtractSCCB(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1456,7 +1456,7 @@ } static bool parseCachePolicy(uint64_t Value, - bool *GLC, bool *SLC, bool *DLC) { + bool *GLC, bool *SLC, bool *DLC, bool *SCC) { if (GLC) { *GLC = (Value & 0x1) ? 1 : 0; Value &= ~(uint64_t)0x1; @@ -1469,6 +1469,10 @@ *DLC = (Value & 0x4) ? 1 : 0; Value &= ~(uint64_t)0x4; } + if (SCC) { + *SCC = (Value & 0x10) ? 1 : 0; + Value &= ~(uint64_t)0x10; + } return Value == 0; } @@ -1601,16 +1605,17 @@ bool GLC = false; bool SLC = false; bool DLC = false; + bool SCC = false; if (BaseOpcode->Atomic) { GLC = true; // TODO no-return optimization if (!parseCachePolicy( MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr, - &SLC, IsGFX10Plus ? &DLC : nullptr)) + &SLC, IsGFX10Plus ? &DLC : nullptr, &SCC)) return false; } else { if (!parseCachePolicy( MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC, - &SLC, IsGFX10Plus ? &DLC : nullptr)) + &SLC, IsGFX10Plus ? 
&DLC : nullptr, &SCC)) return false; } @@ -1700,6 +1705,8 @@ MIB.addImm(Unorm); if (IsGFX10Plus) MIB.addImm(DLC); + else + MIB.addImm(SCC); MIB.addImm(GLC); MIB.addImm(SLC); @@ -2904,6 +2911,8 @@ bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD( MachineInstr &MI) const { + if (STI.hasGFX90AInsts()) + return selectImpl(MI, *CoverageInfo); MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2988,6 +2997,9 @@ bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic( MachineInstr &MI) const{ + if (STI.hasGFX90AInsts()) + return selectImpl(MI, *CoverageInfo); + MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -3013,6 +3025,7 @@ .addReg(Data) .addImm(Addr.second) .addImm(0) // SLC + .addImm(0) // SCCB .cloneMemRefs(MI); MI.eraseFromParent(); @@ -4143,7 +4156,8 @@ addZeroImm, // slc addZeroImm, // tfe addZeroImm, // dlc - addZeroImm // swz + addZeroImm, // swz + addZeroImm // scc }}; } @@ -4171,7 +4185,8 @@ addZeroImm, // slc addZeroImm, // tfe addZeroImm, // dlc - addZeroImm // swz + addZeroImm, // swz + addZeroImm // scc }}; } @@ -4345,6 +4360,13 @@ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1); } +void AMDGPUInstructionSelector::renderExtractSCCB(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + assert(OpIdx >= 0 && "expected to match an immediate operand"); + MIB.addImm((MI.getOperand(OpIdx).getImm() >> 4) & 1); +} + void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1287,8 +1287,10 @@ } if (ST.hasLDSFPAtomics()) { - getActionDefinitionsBuilder(G_ATOMICRMW_FADD) + auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD) .legalFor({{S32, LocalPtr}, {S32,
RegionPtr}}); + if (ST.hasGFX90AInsts()) + Atomic.legalFor({{S64, LocalPtr}}); } // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output @@ -3903,9 +3905,16 @@ case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; + case Intrinsic::amdgcn_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; + case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; + case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; default: llvm_unreachable("unhandled atomic opcode"); } @@ -4815,6 +4824,11 @@ case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_buffer_atomic_fadd: + case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + case Intrinsic::amdgcn_struct_buffer_atomic_fmax: return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::amdgcn_atomic_inc: return legalizeAtomicIncDec(MI, B, true); Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1714,6 +1714,15 @@ return (CachePolicy >> 2) & 1; } +static unsigned extractSWZ(unsigned CachePolicy) { + return (CachePolicy >> 3) & 1; +} + +static unsigned extractSCCB(unsigned CachePolicy) { + return (CachePolicy >> 4) & 1; +} + + MachineInstr * AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, MachineInstr &MI) const { @@ 
-1782,6 +1791,8 @@ .addImm(extractSLC(CachePolicy)) .addImm(0) // tfe: FIXME: Remove from inst .addImm(extractDLC(CachePolicy)) + .addImm(extractSWZ(CachePolicy)) + .addImm(extractSCCB(CachePolicy)) .cloneMemRefs(MI); // FIXME: We need a way to report failure from applyMappingImpl. @@ -2839,7 +2850,9 @@ executeInWaterfallLoop(MI, MRI, {2, 5}); return; } - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { applyDefaultMapping(OpdMapper); executeInWaterfallLoop(MI, MRI, {2, 5}); return; @@ -3807,7 +3820,9 @@ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: - case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: { + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { // vdata_out OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); @@ -4064,7 +4079,14 @@ case Intrinsic::amdgcn_mfma_i32_32x32x4i8: case Intrinsic::amdgcn_mfma_i32_32x32x8i8: case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: - case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: { + case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: + case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: + case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: + case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: + case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: + case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: + case Intrinsic::amdgcn_mfma_f64_16x16x4f64: + case Intrinsic::amdgcn_mfma_f64_4x4x4f64: { // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. 
// FIXME: Should we eventually add an alternative mapping with AGPR src @@ -4138,6 +4160,11 @@ } case Intrinsic::amdgcn_global_atomic_fadd: case Intrinsic::amdgcn_global_atomic_csub: + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { Index: llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -199,6 +199,12 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -226,6 +232,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -240,6 +248,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -275,6 +285,13 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. 
These produce a divergent and Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -45,6 +45,7 @@ Triple TargetTriple; protected: + bool GCN3Encoding; bool Has16BitInsts; bool HasMadMixInsts; bool HasMadMacF32Insts; @@ -124,6 +125,10 @@ return TargetTriple.getArch() == Triple::amdgcn; } + bool isGCN3Encoding() const { + return GCN3Encoding; + } + bool has16BitInsts() const { return Has16BitInsts; } Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -176,6 +176,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT), + GCN3Encoding(false), Has16BitInsts(false), HasMadMixInsts(false), HasMadMacF32Insts(false), @@ -207,6 +208,7 @@ FastFMAF32(false), FastDenormalF32(false), HalfRate64Ops(false), + FullRate64Ops(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), @@ -216,6 +218,7 @@ HasApertureRegs(false), SupportsXNACK(false), EnableXNACK(false), + EnableTgSplit(false), EnableCuMode(false), TrapHandler(false), @@ -227,10 +230,10 @@ DumpCode(false), FP64(false), - GCN3Encoding(false), CIInsts(false), GFX8Insts(false), GFX9Insts(false), + GFX90AInsts(false), GFX10Insts(false), GFX10_3Insts(false), GFX7GFX8GFX9Insts(false), @@ -249,6 +252,9 @@ HasSDWAOutModsVOPC(false), HasDPP(false), HasDPP8(false), + Has64BitDPP(false), + HasPackedFP32Ops(false), + HasExtendedImageInsts(false), HasR128A16(false), HasGFX10A16(false), HasG16(false), @@ -284,6 +290,7 @@ HasMFMAInlineLiteralBug(false), UnalignedBufferAccess(false), UnalignedDSAccess(false), + HasPackedTID(false), ScalarizeGlobal(false), @@ -776,6 +783,9 @@ unsigned Requested = AMDGPU::getIntegerAttribute( F, "amdgpu-num-vgpr", MaxNumVGPRs); + if (hasGFX90AInsts()) + Requested *= 
2; + // Make sure requested value is compatible with values implied by // default/requested minimum/maximum number of waves per execution unit. if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -311,7 +311,7 @@ } unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const { - return 32; + return (Vector && ST->hasPackedFP32Ops()) ? 64 : 32; } unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { @@ -321,7 +321,9 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { if (Opcode == Instruction::Load || Opcode == Instruction::Store) return 32 * 4 / ElemWidth; - return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1; + return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 + : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2 + : 1; } unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, @@ -628,6 +630,8 @@ LLVM_FALLTHROUGH; case ISD::FADD: case ISD::FSUB: + if (ST->hasPackedFP32Ops() && SLT == MVT::f32) + NElts = (NElts + 1) / 2; if (SLT == MVT::f64) return LT.first * NElts * get64BitInstrCost(CostKind); @@ -779,7 +783,8 @@ if (SLT == MVT::f64) return LT.first * NElts * get64BitInstrCost(CostKind); - if (ST->has16BitInsts() && SLT == MVT::f16) + if ((ST->has16BitInsts() && SLT == MVT::f16) || + (ST->hasPackedFP32Ops() && SLT == MVT::f32)) NElts = (NElts + 1) / 2; // TODO: Get more refined intrinsic costs? @@ -1192,8 +1197,10 @@ } int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const { - return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind) - : getQuarterRateInstrCost(CostKind); + return ST->hasFullRate64Ops() + ? getFullRateInstrCost() + : ST->hasHalfRate64Ops() ? 
getHalfRateInstrCost(CostKind) + : getQuarterRateInstrCost(CostKind); } R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F) Index: llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIDefines.h" #include "SIInstrInfo.h" +#include "SIRegisterInfo.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" @@ -114,6 +115,7 @@ ImmTyOffset0, ImmTyOffset1, ImmTyDLC, + ImmTySCCB, ImmTyGLC, ImmTySLC, ImmTySWZ, @@ -299,6 +301,8 @@ return isRegKind() && getReg() == AMDGPU::SGPR_NULL; } + bool isVRegWithInputMods() const; + bool isSDWAOperand(MVT type) const; bool isSDWAFP16Operand() const; bool isSDWAFP32Operand() const; @@ -337,6 +341,7 @@ bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } bool isDLC() const { return isImmTy(ImmTyDLC); } + bool isSCCB() const { return isImmTy(ImmTySCCB); } bool isGLC() const { return isImmTy(ImmTyGLC); } // "GLC_1" is a MatchClass of the GLC_1 operand with the default and forced // value of the GLC operand. 
@@ -449,6 +454,26 @@ return isSSrcF16(); } + bool isSSrcV2FP32() const { + llvm_unreachable("cannot happen"); + return isSSrcF32(); + } + + bool isSCSrcV2FP32() const { + llvm_unreachable("cannot happen"); + return isSCSrcF32(); + } + + bool isSSrcV2INT32() const { + llvm_unreachable("cannot happen"); + return isSSrcB32(); + } + + bool isSCSrcV2INT32() const { + llvm_unreachable("cannot happen"); + return isSCSrcB32(); + } + bool isSSrcOrLdsB32() const { return isRegOrInlineNoMods(AMDGPU::SRegOrLds_32RegClassID, MVT::i32) || isLiteralImm(MVT::i32) || isExpr(); @@ -502,6 +527,22 @@ return isVSrcB16() || isLiteralImm(MVT::v2i16); } + bool isVCSrcV2FP32() const { + return isVCSrcF64(); + } + + bool isVSrcV2FP32() const { + return isVSrcF64() || isLiteralImm(MVT::v2f32); + } + + bool isVCSrcV2INT32() const { + return isVCSrcB64(); + } + + bool isVSrcV2INT32() const { + return isVSrcB64() || isLiteralImm(MVT::v2i32); + } + bool isVSrcF32() const { return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr(); } @@ -542,6 +583,102 @@ return isVISrcF16() || isVISrcB32(); } + bool isVISrc_64B64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64); + } + + bool isVISrc_64F64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f64); + } + + bool isVISrc_64V2FP32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f32); + } + + bool isVISrc_64V2INT32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); + } + + bool isVISrc_256B64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64); + } + + bool isVISrc_256F64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64); + } + + bool isVISrc_128B16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16); + } + + bool isVISrc_128V2B16() const { + return isVISrc_128B16(); + } + + bool isVISrc_128B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i32); + } + + bool 
isVISrc_128F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f32); + } + + bool isVISrc_256V2FP32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32); + } + + bool isVISrc_256V2INT32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32); + } + + bool isVISrc_512B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i32); + } + + bool isVISrc_512B16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::i16); + } + + bool isVISrc_512V2B16() const { + return isVISrc_512B16(); + } + + bool isVISrc_512F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f32); + } + + bool isVISrc_512F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f16); + } + + bool isVISrc_512V2F16() const { + return isVISrc_512F16() || isVISrc_512B32(); + } + + bool isVISrc_1024B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i32); + } + + bool isVISrc_1024B16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::i16); + } + + bool isVISrc_1024V2B16() const { + return isVISrc_1024B16(); + } + + bool isVISrc_1024F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f32); + } + + bool isVISrc_1024F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_1024RegClassID, MVT::f16); + } + + bool isVISrc_1024V2F16() const { + return isVISrc_1024F16() || isVISrc_1024B32(); + } + bool isAISrcB32() const { return isRegOrInlineNoMods(AMDGPU::AGPR_32RegClassID, MVT::i32); } @@ -566,6 +703,14 @@ return isAISrcF16() || isAISrcB32(); } + bool isAISrc_64B64() const { + return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::i64); + } + + bool isAISrc_64F64() const { + return isRegOrInlineNoMods(AMDGPU::AReg_64RegClassID, MVT::f64); + } + bool isAISrc_128B32() const { return isRegOrInlineNoMods(AMDGPU::AReg_128RegClassID, MVT::i32); } @@ -590,6 +735,22 @@ return isAISrc_128F16() || 
isAISrc_128B32(); } + bool isVISrc_128F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f16); + } + + bool isVISrc_128V2F16() const { + return isVISrc_128F16() || isVISrc_128B32(); + } + + bool isAISrc_256B64() const { + return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::i64); + } + + bool isAISrc_256F64() const { + return isRegOrInlineNoMods(AMDGPU::AReg_256RegClassID, MVT::f64); + } + bool isAISrc_512B32() const { return isRegOrInlineNoMods(AMDGPU::AReg_512RegClassID, MVT::i32); } @@ -838,6 +999,7 @@ case ImmTyOffset0: OS << "Offset0"; break; case ImmTyOffset1: OS << "Offset1"; break; case ImmTyDLC: OS << "DLC"; break; + case ImmTySCCB: OS << "SCCB"; break; case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTySWZ: OS << "SWZ"; break; @@ -1197,6 +1359,10 @@ return AMDGPU::isGFX9(getSTI()); } + bool isGFX90A() const { + return AMDGPU::isGFX90A(getSTI()); + } + bool isGFX9Plus() const { return AMDGPU::isGFX9Plus(getSTI()); } @@ -1384,6 +1550,8 @@ bool validateVccOperand(unsigned Reg) const; bool validateVOP3Literal(const MCInst &Inst, const OperandVector &Operands); bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands); + bool validateAGPRLdSt(const MCInst &Inst) const; + bool validateVGPRAlign(const MCInst &Inst) const; bool validateDivScale(const MCInst &Inst); bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, const SMLoc &IDLoc); @@ -1459,6 +1627,7 @@ void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); AMDGPUOperand::Ptr defaultDLC() const; + AMDGPUOperand::Ptr defaultSCCB() const; AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultGLC_1() const; AMDGPUOperand::Ptr defaultSLC() const; @@ -1553,11 +1722,16 @@ case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case 
AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: return &APFloat::IEEEsingle(); case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: return &APFloat::IEEEdouble(); case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: @@ -1717,7 +1891,8 @@ // literal goes into the lower half and the upper half is zero. We also // require that the literal may be losslesly converted to f16. MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : - (type == MVT::v2i16)? MVT::i16 : type; + (type == MVT::v2i16)? MVT::i16 : + (type == MVT::v2f32)? MVT::f32 : type; APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); @@ -1727,6 +1902,13 @@ return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } +bool AMDGPUOperand::isVRegWithInputMods() const { + return isRegClass(AMDGPU::VGPR_32RegClassID) || + // GFX90A allows DPP on 64-bit operands. 
+ (isRegClass(AMDGPU::VReg_64RegClassID) && + AsmParser->getFeatureBits()[AMDGPU::Feature64BitDPP]); +} + bool AMDGPUOperand::isSDWAOperand(MVT type) const { if (AsmParser->isVI()) return isVReg32(); @@ -1808,6 +1990,7 @@ case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); @@ -1851,7 +2034,11 @@ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: { + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: { bool lost; APFloat FPLiteral(APFloat::IEEEdouble(), Literal); // Convert literal to single precision @@ -1883,6 +2070,10 @@ case AMDGPU::OPERAND_REG_INLINE_AC_FP32: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: if (isSafeTruncation(Val, 32) && AMDGPU::isInlinableLiteral32(static_cast(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1899,6 +2090,7 @@ case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); setImmKindConst(); @@ -3202,7 +3394,7 @@ return true; unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); - unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 
1 : 0; + unsigned TFESize = (TFEIdx != -1 && Inst.getOperand(TFEIdx).getImm()) ? 1 : 0; unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; if (DMask == 0) DMask = 1; @@ -3804,6 +3996,79 @@ return true; } +// Returns -1 if not a register, 0 if VGPR and 1 if AGPR. +static int IsAGPROperand(const MCInst &Inst, uint16_t NameIdx, + const MCRegisterInfo *MRI) { + int OpIdx = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), NameIdx); + if (OpIdx < 0) + return -1; + + const MCOperand &Op = Inst.getOperand(OpIdx); + if (!Op.isReg()) + return -1; + + unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + auto Reg = Sub ? Sub : Op.getReg(); + const MCRegisterClass &AGRP32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID); + return AGRP32.contains(Reg) ? 1 : 0; +} + +bool AMDGPUAsmParser::validateAGPRLdSt(const MCInst &Inst) const { + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if ((TSFlags & (SIInstrFlags::FLAT | SIInstrFlags::MUBUF | + SIInstrFlags::MTBUF | SIInstrFlags::MIMG | + SIInstrFlags::DS)) == 0) + return true; + + uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? 
AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata; + + const MCRegisterInfo *MRI = getMRI(); + int DstAreg = IsAGPROperand(Inst, AMDGPU::OpName::vdst, MRI); + int DataAreg = IsAGPROperand(Inst, DataNameIdx, MRI); + + if ((TSFlags & SIInstrFlags::DS) && DataAreg >= 0) { + int Data2Areg = IsAGPROperand(Inst, AMDGPU::OpName::data1, MRI); + if (Data2Areg >= 0 && Data2Areg != DataAreg) + return false; + } + + auto FB = getFeatureBits(); + if (FB[AMDGPU::FeatureGFX90AInsts]) { + if (DataAreg < 0 || DstAreg < 0) + return true; + return DstAreg == DataAreg; + } + + return DstAreg < 1 && DataAreg < 1; +} + +bool AMDGPUAsmParser::validateVGPRAlign(const MCInst &Inst) const { + auto FB = getFeatureBits(); + if (!FB[AMDGPU::FeatureGFX90AInsts]) + return true; + + const MCRegisterInfo *MRI = getMRI(); + const MCRegisterClass &VGRP32 = MRI->getRegClass(AMDGPU::VGPR_32RegClassID); + const MCRegisterClass &AGRP32 = MRI->getRegClass(AMDGPU::AGPR_32RegClassID); + for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { + const MCOperand &Op = Inst.getOperand(I); + if (!Op.isReg()) + continue; + + unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + if (!Sub) + continue; + + if (VGRP32.contains(Sub) && ((Sub - AMDGPU::VGPR0) & 1)) + return false; + if (AGRP32.contains(Sub) && ((Sub - AMDGPU::AGPR0) & 1)) + return false; + } + + return true; +} + bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, const SMLoc &IDLoc) { @@ -3895,6 +4160,23 @@ if (!validateMAIAccWrite(Inst, Operands)) { return false; } + if (!validateCoherencyBits(Inst, Operands, IDLoc)) { + return false; + } + + if (!validateAGPRLdSt(Inst)) { + Error(IDLoc, getFeatureBits()[AMDGPU::FeatureGFX90AInsts] + ? 
"invalid register class: data and dst should be all VGPR or AGPR" + : "invalid register class: agpr loads and stores not supported on this GPU" + ); + return false; + } + if (!validateVGPRAlign(Inst)) { + Error(IDLoc, + "invalid register class: vgpr tuples must be 64 bit aligned"); + return false; + } + if (!validateDivScale(Inst)) { Error(IDLoc, "ABS not allowed in VOP3B instructions"); return false; @@ -4145,6 +4427,7 @@ SMRange VGPRRange; uint64_t NextFreeVGPR = 0; + uint64_t AccumOffset = 0; SMRange SGPRRange; uint64_t NextFreeSGPR = 0; unsigned UserSGPRCount = 0; @@ -4273,6 +4556,10 @@ } else if (ID == ".amdhsa_next_free_sgpr") { SGPRRange = ValRange; NextFreeSGPR = Val; + } else if (ID == ".amdhsa_accum_offset") { + if (!isGFX90A()) + return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + AccumOffset = Val; } else if (ID == ".amdhsa_reserve_vcc") { if (!isUInt<1>(Val)) return OutOfRangeError(ValRange); @@ -4313,6 +4600,11 @@ return Error(IDRange.Start, "directive requires gfx9+", IDRange); PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val, ValRange); + } else if (ID == ".amdhsa_tg_split") { + if (!isGFX90A()) + return Error(IDRange.Start, "directive requires gfx90a+", IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Val, + ValRange); } else if (ID == ".amdhsa_workgroup_processor_mode") { if (IVersion.Major < 10) return Error(IDRange.Start, "directive requires gfx10+", IDRange); @@ -4397,6 +4689,18 @@ AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, UserSGPRCount); + if (isGFX90A()) { + if (Seen.find(".amdhsa_accum_offset") == Seen.end()) + return TokError(".amdhsa_accum_offset directive is required"); + if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3)) + return TokError("accum_offset should be in range [4..256] in " + "increments of 4"); + if (AccumOffset > alignTo(std::max((uint64_t)1, NextFreeVGPR), 4)) + return TokError("accum_offset exceeds total 
VGPR allocation"); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, + (AccumOffset / 4 - 1)); + } + getTargetStreamer().EmitAmdhsaKernelDescriptor( getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC, ReserveFlatScr, ReserveXNACK); @@ -5075,6 +5379,8 @@ Error(S, "dlc modifier is not supported on this GPU"); return MatchOperand_ParseFail; } + if (!isGFX90A() && ImmTy == AMDGPUOperand::ImmTySCCB) + return MatchOperand_ParseFail; if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16) ImmTy = AMDGPUOperand::ImmTyR128A16; @@ -6504,6 +6810,10 @@ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDLC); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSCCB() const { + return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySCCB); +} + AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyGLC); } @@ -6586,8 +6896,9 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } - if (isGFX10Plus()) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySCCB); } void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { @@ -6625,9 +6936,9 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); - - if (isGFX10Plus()) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ); + 
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySCCB); } //===----------------------------------------------------------------------===// @@ -6669,14 +6980,21 @@ if (IsGFX10Plus) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); + + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::sccb) != -1) + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTySCCB); + if (IsGFX10Plus) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::tfe) != -1) + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); if (IsGFX10Plus) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); if (!IsGFX10Plus) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); @@ -6784,6 +7102,7 @@ {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr}, {"dlc", AMDGPUOperand::ImmTyDLC, true, nullptr}, + {"scc", AMDGPUOperand::ImmTySCCB, true, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr}, @@ -7021,6 +7340,7 @@ Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 || Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 || Opc == AMDGPU::V_MAC_F16_e64_vi || + Opc == AMDGPU::V_FMAC_F64_e64_gfx90a || Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || Opc == 
AMDGPU::V_FMAC_F32_e64_vi || Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || @@ -7276,6 +7596,15 @@ bool AMDGPUAsmParser::isSupportedDPPCtrl(StringRef Ctrl, const OperandVector &Operands) { + if (Ctrl == "row_newbcast") + return isGFX90A(); + + // DPP64 is supported for row_newbcast only. + const MCRegisterInfo *MRI = getMRI(); + if (Operands.size() > 2 && Operands[1]->isReg() && + MRI->getSubReg(Operands[1]->getReg(), AMDGPU::sub1)) + return false; + if (Ctrl == "row_share" || Ctrl == "row_xmask") return isGFX10Plus(); @@ -7353,6 +7682,7 @@ .Case("row_ror", {DppCtrl::ROW_ROR0, 1, 15}) .Case("row_share", {DppCtrl::ROW_SHARE_FIRST, 0, 15}) .Case("row_xmask", {DppCtrl::ROW_XMASK_FIRST, 0, 15}) + .Case("row_newbcast", {DppCtrl::ROW_NEWBCAST_FIRST, 0, 15}) .Default({-1, 0, 0}); bool Valid; Index: llvm/lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/BUFInstructions.td +++ llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// def MUBUFAddr32 : ComplexPattern; -def MUBUFAddr64 : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; def MUBUFAddr64Atomic : ComplexPattern; def MUBUFScratchOffen : ComplexPattern; def MUBUFScratchOffset : ComplexPattern; -def MUBUFOffset : ComplexPattern; +def MUBUFOffset : ComplexPattern; def MUBUFOffsetNoGLC : ComplexPattern; def MUBUFOffsetAtomic : ComplexPattern; @@ -105,6 +105,8 @@ bits<1> has_slc = 1; bits<1> has_tfe = 1; bits<4> elements = 0; + bits<1> has_sccb = 1; + bits<1> sccb_value = 0; } class MTBUF_Real : @@ -126,7 +128,7 @@ bits<1> dlc; bits<7> format; bits<8> vaddr; - bits<8> vdata; + bits<10> vdata; bits<7> srsrc; bits<1> slc; bits<1> tfe; @@ -134,25 +136,31 @@ bits<4> dfmt = format{3-0}; bits<3> nfmt = format{6-4}; + + bits<1> sccb; + // GFX90A+ only: instruction uses AccVGPR for data + // Bit supersedes tfe.
+ bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); } class getMTBUFInsDA vdataList, list vaddrList=[]> { RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getLdStRegisterOperand.ret; dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb) ); dag InsData = !if(!empty(vaddrList), - (ins vdataClass:$vdata, SReg_128:$srsrc, + (ins vdata_op:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), - (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb), + (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -202,9 +210,9 @@ // Workaround bug bz30254 int addrKindCopy = addrKind> : MTBUF_Pseudo.ret:$vdata), getMTBUFIns.ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz$sccb", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; @@ -252,7 +260,7 @@ : MTBUF_Pseudo.ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz$sccb", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # 
getAddrName.ret; @@ -341,6 +349,8 @@ bits<1> has_slc = 1; bits<1> has_tfe = 1; bits<4> elements = 0; + bits<1> has_sccb = 1; + bits<1> sccb_value = 0; } class MUBUF_Real : @@ -362,11 +372,16 @@ bits<1> glc; bits<1> dlc; bits<8> vaddr; - bits<8> vdata; + bits<10> vdata; bits<7> srsrc; bits<1> slc; bits<1> tfe; bits<8> soffset; + + bits<1> sccb; + // GFX90A+ only: instruction uses AccVGPR for data + // Bit supersedes tfe. + bits<1> acc = !if(ps.has_vdata, vdata{9}, 0); } @@ -395,6 +410,8 @@ let has_offset = 0; let has_slc = 0; let has_tfe = 0; + let has_sccb = 0; + let sccb_value = 0; } class getMUBUFInsDA vdataList, @@ -402,6 +419,7 @@ bit isLds = 0> { RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getLdStRegisterOperand.ret; dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc), @@ -409,14 +427,14 @@ offset:$offset, GLC:$glc, SLC:$slc) ); dag InsData = !if(!empty(vaddrList), - (ins vdataClass:$vdata, SReg_128:$srsrc, + (ins vdata_op:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc), - (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc) ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz)) + !if(isLds, (ins DLC:$dlc, SWZ:$swz, SCCB_0:$sccb), (ins TFE:$tfe, DLC:$dlc, SWZ:$swz, SCCB_0:$sccb)) ); } @@ -482,13 +500,15 @@ bit isLds = 0, list pattern=[], // Workaround bug bz30254 - int addrKindCopy = addrKind> + int addrKindCopy = addrKind, + RegisterClass vdata_rc = getVregSrcForVT.ret, + RegisterOperand vdata_op = getLdStRegisterOperand.ret> : MUBUF_Pseudo.ret:$vdata), + (outs vdata_op:$vdata), !con(getMUBUFIns.ret, - !if(HasTiedDest, (ins
getVregSrcForVT.ret:$vdata_in), (ins))), + !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe") # "$dlc$swz", + !if(isLds, " lds", "$tfe") # "$dlc$swz$sccb", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -567,7 +587,7 @@ : MUBUF_Pseudo.ret]>.ret, - " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", + " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz$sccb", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; @@ -608,8 +628,8 @@ class MUBUF_Pseudo_Store_Lds : MUBUF_Pseudo { + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz, SCCB_0:$sccb), + " $srsrc, $soffset$offset lds$glc$slc$swz$sccb"> { let mayLoad = 0; let mayStore = 1; let maybeAtomic = 1; @@ -626,17 +646,18 @@ class getMUBUFAtomicInsDA vaddrList=[]> { RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + RegisterOperand vdata_op = getLdStRegisterOperand.ret; dag ret = !if(vdata_in, !if(!empty(vaddrList), - (ins vdataClass:$vdata_in, + (ins vdata_op:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc), - (ins vdataClass:$vdata_in, vaddrClass:$vaddr, + (ins vdata_op:$vdata_in, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC_1:$glc1, SLC:$slc) ), !if(!empty(vaddrList), - (ins vdataClass:$vdata, + (ins vdata_op:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc), - (ins vdataClass:$vdata, vaddrClass:$vaddr, + (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc) )); } @@ -678,6 +699,7 @@ let has_glc = 0; let has_dlc = 0; let has_tfe = 0; + let has_sccb = 0; let maybeAtomic = 1; } @@ -696,6 +718,7 @@ let PseudoInstr = opName # "_" # getAddrName.ret; let glc_value = 0; let dlc_value = 0; + let sccb_value = 0; let IsAtomicNoRet = 1; let AsmMatchConverter = "cvtMubufAtomic"; } @@ 
-705,9 +728,10 @@ list pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, - RegisterClass vdataClassCopy = vdataClass> + RegisterClass vdataClassCopy = vdataClass, + RegisterOperand vdata_op = getLdStRegisterOperand.ret> : MUBUF_Atomic_Pseudo.ret, " $vdata, " # getMUBUFAsmOps.ret # "$glc1$slc", pattern>, @@ -715,6 +739,7 @@ let PseudoInstr = opName # "_rtn_" # getAddrName.ret; let glc_value = 1; let dlc_value = 0; + let sccb_value = 0; let IsAtomicRet = 1; let Constraints = "$vdata = $vdata_in"; let DisableEncoding = "$vdata_in"; @@ -1108,6 +1133,15 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_noret_32 >; + +let OtherPredicates = [isGFX90APlus] in { +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN < + "buffer_atomic_add_f32", VGPR_32, f32, atomic_load_fadd_global_32 +>; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < + "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_load_fadd_v2f16_global_32 +>; +} } // End SubtargetPredicate = HasAtomicFaddInsts //===----------------------------------------------------------------------===// @@ -1156,6 +1190,17 @@ } // End let SubtargetPredicate = isGFX7Plus +let SubtargetPredicate = isGFX90APlus in { + def BUFFER_WBL2 : MUBUF_Invalidate<"buffer_wbl2"> { + } + def BUFFER_INVL2 : MUBUF_Invalidate<"buffer_invl2"> { + } + + defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>; + defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>; + defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>; +} // End SubtargetPredicate = isGFX90APlus + let SubtargetPredicate = isGFX10Plus in { def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">; def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">; @@ -1178,7 +1223,7 @@ 
timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1186,7 +1231,7 @@ timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1194,7 +1239,7 @@ timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1204,7 +1249,7 @@ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; } @@ -1264,7 +1309,7 @@ timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) getVregSrcForVT.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1273,7 +1318,7 @@ (!cast(opcode # _OFFEN_exact) getVregSrcForVT.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1282,7 +1327,7 @@ (!cast(opcode # _IDXEN_exact) 
getVregSrcForVT.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1293,7 +1338,7 @@ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; } @@ -1462,15 +1507,24 @@ defm : BufferAtomicPatterns_NO_RTN; } +let SubtargetPredicate = isGFX90APlus in { + defm : BufferAtomicPatterns; + defm : BufferAtomicPatterns; + + defm : BufferAtomicPatterns; + defm : BufferAtomicPatterns; + defm : BufferAtomicPatterns; +} // End SubtargetPredicate = isGFX90APlus + def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), - (extract_slc $cachepolicy)), sub0) + (extract_slc $cachepolicy)), VReg_64)), sub0) >; def : GCNPat< @@ -1478,10 +1532,10 @@ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), + VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), VReg_64)), sub0) >; @@ -1490,10 +1544,10 @@ i32:$data, i32:$cmp, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 
0), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), + VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), VReg_64)), sub0) >; @@ -1502,11 +1556,11 @@ i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), + SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)), VReg_64)), sub0) >; @@ -1522,12 +1576,12 @@ def : GCNPat < (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0, 0) >; def : GCNPat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0, 0) >; } @@ -1572,12 +1626,12 @@ def : GCNPat < (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, 0) >; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, 0) >; } @@ -1639,12 +1693,12 @@ def : GCNPat < (atomic_st 
(MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0, 0) >; def : GCNPat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0, 0) >; } let SubtargetPredicate = isGFX6GFX7 in { @@ -1673,13 +1727,13 @@ def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), - (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) + (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, 0) >; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), - (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) + (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, 0) >; } @@ -1726,7 +1780,7 @@ (!cast(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1735,7 +1789,7 @@ (!cast(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1744,7 +1798,7 @@ (!cast(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) 
>; def : GCNPat< @@ -1755,7 +1809,7 @@ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; } @@ -1794,7 +1848,7 @@ (!cast(opcode # _OFFSET_exact) getVregSrcForVT.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1803,7 +1857,7 @@ (!cast(opcode # _IDXEN_exact) getVregSrcForVT.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1812,7 +1866,7 @@ (!cast(opcode # _OFFEN_exact) getVregSrcForVT.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; def : GCNPat< @@ -1823,7 +1877,7 @@ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), - (extract_swz $auxiliary)) + (extract_swz $auxiliary), (extract_sccb $auxiliary)) >; } @@ -1870,7 +1924,7 @@ let Inst{24-18} = op; let Inst{31-26} = 0x38; let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); let Inst{54} = !if(ps.has_slc, slc, ?); let 
Inst{55} = !if(ps.has_tfe, tfe, ?); @@ -2124,7 +2178,7 @@ let Inst{18-16} = op; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); let Inst{54} = !if(ps.has_slc, slc, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); @@ -2206,33 +2260,53 @@ // GFX8, GFX9 (VI). //===----------------------------------------------------------------------===// -class MUBUF_Real_vi op, MUBUF_Pseudo ps> : +class MUBUF_Real_Base_vi op, MUBUF_Pseudo ps, int Enc> : MUBUF_Real, Enc64, - SIMCInstr { - let AssemblerPredicate = isGFX8GFX9; - let DecoderNamespace = "GFX8"; + SIMCInstr { let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{15} = !if(ps.has_sccb, sccb, ps.sccb_value); let Inst{16} = ps.lds; let Inst{17} = !if(ps.has_slc, slc, ?); let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); - let Inst{55} = !if(ps.has_tfe, tfe, ?); let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } +class MUBUF_Real_vi op, MUBUF_Pseudo ps> : + MUBUF_Real_Base_vi { + let AssemblerPredicate = isGFX8GFX9NotGFX90A; + let DecoderNamespace = "GFX8"; + + let Inst{55} = !if(ps.has_tfe, tfe, ?); +} + +class MUBUF_Real_gfx90a op, MUBUF_Pseudo ps> : + MUBUF_Real_Base_vi { + let AssemblerPredicate = isGFX90APlus; + let DecoderNamespace = "GFX90A"; + let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands); + + let Inst{55} = acc; +} + +multiclass MUBUF_Real_vi_gfx90a op, MUBUF_Pseudo ps> { + def _vi : MUBUF_Real_vi; + def _gfx90a : MUBUF_Real_gfx90a; +} + multiclass MUBUF_Real_AllAddr_vi op> { - def _OFFSET_vi : 
MUBUF_Real_vi (NAME#"_OFFSET")>; - def _OFFEN_vi : MUBUF_Real_vi (NAME#"_OFFEN")>; - def _IDXEN_vi : MUBUF_Real_vi (NAME#"_IDXEN")>; - def _BOTHEN_vi : MUBUF_Real_vi (NAME#"_BOTHEN")>; + defm _OFFSET : MUBUF_Real_vi_gfx90a (NAME#"_OFFSET")>; + defm _OFFEN : MUBUF_Real_vi_gfx90a (NAME#"_OFFEN")>; + defm _IDXEN : MUBUF_Real_vi_gfx90a (NAME#"_IDXEN")>; + defm _BOTHEN : MUBUF_Real_vi_gfx90a (NAME#"_BOTHEN")>; } multiclass MUBUF_Real_AllAddr_Lds_vi op> { @@ -2254,6 +2328,24 @@ MUBUFLdsTable<1, NAME # "_IDXEN_vi">; def _LDS_BOTHEN_vi : MUBUF_Real_vi (NAME#"_LDS_BOTHEN")>, MUBUFLdsTable<1, NAME # "_BOTHEN_vi">; + + def _OFFSET_gfx90a : MUBUF_Real_gfx90a (NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_gfx90a">; + def _OFFEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_gfx90a">; + def _IDXEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_gfx90a">; + def _BOTHEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # "_BOTHEN_gfx90a">; + + def _LDS_OFFSET_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_gfx90a">; + def _LDS_OFFEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_gfx90a">; + def _LDS_IDXEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_gfx90a">; + def _LDS_BOTHEN_gfx90a : MUBUF_Real_gfx90a (NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_gfx90a">; } class MUBUF_Real_gfx80 op, MUBUF_Pseudo ps> : @@ -2272,7 +2364,7 @@ let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); let Inst{63-56} = !if(ps.has_soffset, soffset, ?); @@ -2287,10 +2379,10 @@ multiclass MUBUF_Real_Atomic_vi op> : MUBUF_Real_AllAddr_vi { - def 
_OFFSET_RTN_vi : MUBUF_Real_vi (NAME#"_OFFSET_RTN")>; - def _OFFEN_RTN_vi : MUBUF_Real_vi (NAME#"_OFFEN_RTN")>; - def _IDXEN_RTN_vi : MUBUF_Real_vi (NAME#"_IDXEN_RTN")>; - def _BOTHEN_RTN_vi : MUBUF_Real_vi (NAME#"_BOTHEN_RTN")>; + defm _OFFSET_RTN : MUBUF_Real_vi_gfx90a (NAME#"_OFFSET_RTN")>; + defm _OFFEN_RTN : MUBUF_Real_vi_gfx90a (NAME#"_OFFEN_RTN")>; + defm _IDXEN_RTN : MUBUF_Real_vi_gfx90a (NAME#"_IDXEN_RTN")>; + defm _BOTHEN_RTN : MUBUF_Real_vi_gfx90a (NAME#"_BOTHEN_RTN")>; } defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_vi <0x00>; @@ -2376,24 +2468,34 @@ defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>; -def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>; +defm BUFFER_STORE_LDS_DWORD : MUBUF_Real_vi_gfx90a <0x3d, BUFFER_STORE_LDS_DWORD>; +let AssemblerPredicate = isGFX8GFX9 in { def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; +} // End AssemblerPredicate = isGFX8GFX9 let SubtargetPredicate = HasAtomicFaddInsts in { -defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_AllAddr_vi <0x4d>; -defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_AllAddr_vi <0x4e>; +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; } // End SubtargetPredicate = HasAtomicFaddInsts -class MTBUF_Real_vi op, MTBUF_Pseudo ps> : +let SubtargetPredicate = isGFX90APlus, AssemblerPredicate = isGFX90APlus in { + defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>; + defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_vi<0x50>; + defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_vi<0x51>; +} // End SubtargetPredicate = isGFX90APlus, AssemblerPredicate = isGFX90APlus + +def BUFFER_WBL2_gfx90a : MUBUF_Real_gfx90a<0x28, BUFFER_WBL2> { +} +def BUFFER_INVL2_gfx90a : MUBUF_Real_gfx90a<0x29, BUFFER_INVL2>; + +class MTBUF_Real_Base_vi op, MTBUF_Pseudo ps, int Enc> : MTBUF_Real, 
Enc64, - SIMCInstr { - let AssemblerPredicate = isGFX8GFX9; - let DecoderNamespace = "GFX8"; + SIMCInstr { let Inst{11-0} = !if(ps.has_offset, offset, ?); let Inst{12} = ps.offen; @@ -2404,18 +2506,40 @@ let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{53} = !if(ps.has_sccb, sccb, ps.sccb_value); let Inst{54} = !if(ps.has_slc, slc, ?); - let Inst{55} = !if(ps.has_tfe, tfe, ?); let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } +class MTBUF_Real_vi op, MTBUF_Pseudo ps> : + MTBUF_Real_Base_vi { + let AssemblerPredicate = isGFX8GFX9NotGFX90A; + let DecoderNamespace = "GFX8"; + + let Inst{55} = !if(ps.has_tfe, tfe, ?); +} + +class MTBUF_Real_gfx90a op, MTBUF_Pseudo ps> : + MTBUF_Real_Base_vi { + let AssemblerPredicate = isGFX90APlus; + let DecoderNamespace = "GFX90A"; + let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands); + + let Inst{55} = acc; +} + +multiclass MTBUF_Real_vi_gfx90a op, MTBUF_Pseudo ps> { + def _vi : MTBUF_Real_vi; + def _gfx90a : MTBUF_Real_gfx90a; +} + multiclass MTBUF_Real_AllAddr_vi op> { - def _OFFSET_vi : MTBUF_Real_vi (NAME#"_OFFSET")>; - def _OFFEN_vi : MTBUF_Real_vi (NAME#"_OFFEN")>; - def _IDXEN_vi : MTBUF_Real_vi (NAME#"_IDXEN")>; - def _BOTHEN_vi : MTBUF_Real_vi (NAME#"_BOTHEN")>; + defm _OFFSET : MTBUF_Real_vi_gfx90a (NAME#"_OFFSET")>; + defm _OFFEN : MTBUF_Real_vi_gfx90a (NAME#"_OFFEN")>; + defm _IDXEN : MTBUF_Real_vi_gfx90a (NAME#"_IDXEN")>; + defm _BOTHEN : MTBUF_Real_vi_gfx90a (NAME#"_BOTHEN")>; } class MTBUF_Real_gfx80 op, MTBUF_Pseudo ps> : @@ -2434,7 +2558,7 @@ let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata{7-0}, ?); let Inst{52-48} = 
!if(ps.has_srsrc, srsrc{6-2}, ?); let Inst{54} = !if(ps.has_slc, slc, ?); let Inst{55} = !if(ps.has_tfe, tfe, ?); Index: llvm/lib/Target/AMDGPU/DSInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/DSInstructions.td +++ llvm/lib/Target/AMDGPU/DSInstructions.td @@ -67,17 +67,20 @@ let AsmMatchConverter = ds.AsmMatchConverter; // encoding fields - bits<8> vdst; + bits<10> vdst; bits<1> gds; bits<8> addr; - bits<8> data0; - bits<8> data1; + bits<10> data0; + bits<10> data1; bits<8> offset0; bits<8> offset1; bits<16> offset; let offset0 = !if(ds.has_offset, offset{7-0}, ?); let offset1 = !if(ds.has_offset, offset{15-8}, ?); + + bits<1> acc = !if(ds.has_vdst, vdst{9}, + !if(!or(ds.has_data0, ds.has_gws_data0), data0{9}, 0)); } @@ -86,7 +89,7 @@ class DS_0A1D_NORET : DS_Pseudo.ret:$data0, offset:$offset, gds:$gds), " $data0$offset$gds"> { let has_addr = 0; @@ -97,7 +100,7 @@ class DS_1A1D_NORET : DS_Pseudo.ret:$data0, offset:$offset, gds:$gds), " $addr, $data0$offset$gds"> { let has_data1 = 0; @@ -115,10 +118,18 @@ } } -class DS_1A2D_NORET +multiclass DS_1A1D_NORET_mc_gfx9 { + let has_m0_read = 0 in { + def "" : DS_1A1D_NORET, + AtomicNoRet; + } +} + +class DS_1A2D_NORET.ret> : DS_Pseudo { let has_vdst = 0; @@ -135,10 +146,11 @@ } } -class DS_1A2D_Off8_NORET +class DS_1A2D_Off8_NORET .ret> : DS_Pseudo { @@ -155,10 +167,11 @@ } } -class DS_1A1D_RET +class DS_1A1D_RET .ret> : DS_Pseudo { let hasPostISelHook = 1; @@ -178,12 +191,23 @@ } } +multiclass DS_1A1D_RET_mc_gfx9 { + let has_m0_read = 0 in { + def "" : DS_1A1D_RET, + AtomicNoRet; + } +} + class DS_1A2D_RET + RegisterClass src = rc, + RegisterOperand dst_op = getLdStRegisterOperand.ret, + RegisterOperand src_op = getLdStRegisterOperand.ret> : DS_Pseudo { let hasPostISelHook = 1; @@ -205,10 +229,12 @@ class DS_1A2D_Off8_RET + RegisterClass src = rc, + RegisterOperand dst_op = getLdStRegisterOperand.ret, + RegisterOperand src_op = getLdStRegisterOperand.ret> : 
DS_Pseudo { let has_offset = 0; @@ -228,11 +254,12 @@ } -class DS_1A_RET +class DS_1A_RET.ret> : DS_Pseudo { let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); @@ -254,7 +281,7 @@ class DS_1A_Off8_RET : DS_Pseudo.ret:$vdst), (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds), " $vdst, $addr$offset0$offset1$gds"> { @@ -273,7 +300,7 @@ } class DS_1A_RET_GDS : DS_Pseudo.ret:$vdst), (ins VGPR_32:$addr, offset:$offset), " $vdst, $addr$offset gds"> { @@ -285,7 +312,7 @@ } class DS_0A_RET : DS_Pseudo.ret:$vdst), (ins offset:$offset, gds:$gds), " $vdst$offset$gds"> { @@ -340,7 +367,8 @@ class DS_GWS_1D : DS_GWS { + (ins getLdStRegisterOperand.ret:$data0, offset:$offset), + " $data0$offset gds"> { let has_gws_data0 = 1; let hasSideEffects = 1; @@ -364,10 +392,11 @@ let has_gds = 0; } -class DS_1A1D_PERMUTE +class DS_1A1D_PERMUTE .ret> : DS_Pseudo { @@ -424,6 +453,11 @@ } // End mayLoad = 0 +let SubtargetPredicate = isGFX90APlus in { + defm DS_ADD_F64 : DS_1A1D_NORET_mc_gfx9<"ds_add_f64", VReg_64>; + defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">; +} // End SubtargetPredicate = isGFX90APlus + defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">; defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">; defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">; @@ -942,6 +976,10 @@ defm : DSAtomicCmpXChg_mc; +let SubtargetPredicate = isGFX90APlus in { +def : DSAtomicRetPat; +} + def : Pat < (SIds_ordered_count i32:$value, i16:$offset), (DS_ORDERED_COUNT $value, (as_i16imm $offset)) @@ -963,10 +1001,10 @@ let Inst{17} = !if(ps.has_gds, gds, ps.gdsValue); let Inst{25-18} = op; let Inst{31-26} = 0x36; - let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0, 0)); - let Inst{47-40} = !if(ps.has_data0, data0, 0); - let Inst{55-48} = !if(ps.has_data1, data1, 0); - let Inst{63-56} = !if(ps.has_vdst, vdst, 0); + let Inst{39-32} = !if(ps.has_addr, addr, !if(ps.has_gws_data0, data0{7-0}, 0)); + let Inst{47-40} =
!if(ps.has_data0, data0{7-0}, 0); + let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0); + let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0); } //===----------------------------------------------------------------------===// @@ -1181,11 +1219,12 @@ let Inst{15-8} = !if(ds.has_offset1, offset1, 0); let Inst{16} = !if(ds.has_gds, gds, ds.gdsValue); let Inst{24-17} = op; + let Inst{25} = acc; let Inst{31-26} = 0x36; // ds prefix - let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0, 0)); - let Inst{47-40} = !if(ds.has_data0, data0, 0); - let Inst{55-48} = !if(ds.has_data1, data1, 0); - let Inst{63-56} = !if(ds.has_vdst, vdst, 0); + let Inst{39-32} = !if(ds.has_addr, addr, !if(ds.has_gws_data0, data0{7-0}, 0)); + let Inst{47-40} = !if(ds.has_data0, data0{7-0}, 0); + let Inst{55-48} = !if(ds.has_data1, data1{7-0}, 0); + let Inst{63-56} = !if(ds.has_vdst, vdst{7-0}, 0); } def DS_ADD_U32_vi : DS_Real_vi<0x0, DS_ADD_U32>; @@ -1348,3 +1387,8 @@ def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>; def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>; def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; + +let SubtargetPredicate = isGFX90APlus in { + def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>; + def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>; +} // End SubtargetPredicate = isGFX90APlus Index: llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h =================================================================== --- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -99,12 +99,14 @@ MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; MCOperand decodeOperand_VSrcV216(unsigned Val) const; + MCOperand decodeOperand_VSrcV232(unsigned Val) const; MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; MCOperand decodeOperand_VReg_128(unsigned Val) const; MCOperand 
decodeOperand_VReg_256(unsigned Val) const; MCOperand decodeOperand_VReg_512(unsigned Val) const; + MCOperand decodeOperand_VReg_1024(unsigned Val) const; MCOperand decodeOperand_SReg_32(unsigned Val) const; MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const; @@ -117,7 +119,9 @@ MCOperand decodeOperand_SReg_512(unsigned Val) const; MCOperand decodeOperand_AGPR_32(unsigned Val) const; + MCOperand decodeOperand_AReg_64(unsigned Val) const; MCOperand decodeOperand_AReg_128(unsigned Val) const; + MCOperand decodeOperand_AReg_256(unsigned Val) const; MCOperand decodeOperand_AReg_512(unsigned Val) const; MCOperand decodeOperand_AReg_1024(unsigned Val) const; MCOperand decodeOperand_AV_32(unsigned Val) const; @@ -126,12 +130,15 @@ enum OpWidthTy { OPW32, OPW64, + OPW96, OPW128, + OPW160, OPW256, OPW512, OPW1024, OPW16, OPWV216, + OPWV232, OPW_LAST_, OPW_FIRST_ = OPW32 }; @@ -159,8 +166,11 @@ int getTTmpIdx(unsigned Val) const; + const MCInstrInfo *getMCII() const { return MCII.get(); } + bool isVI() const; bool isGFX9() const; + bool isGFX90A() const; bool isGFX9Plus() const; bool isGFX10() const; bool isGFX10Plus() const; Index: llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -123,6 +123,7 @@ DECODE_OPERAND_REG(VReg_128) DECODE_OPERAND_REG(VReg_256) DECODE_OPERAND_REG(VReg_512) +DECODE_OPERAND_REG(VReg_1024) DECODE_OPERAND_REG(SReg_32) DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) @@ -135,7 +136,9 @@ DECODE_OPERAND_REG(SReg_512) DECODE_OPERAND_REG(AGPR_32) +DECODE_OPERAND_REG(AReg_64) DECODE_OPERAND_REG(AReg_128) +DECODE_OPERAND_REG(AReg_256) DECODE_OPERAND_REG(AReg_512) DECODE_OPERAND_REG(AReg_1024) DECODE_OPERAND_REG(AV_32) @@ -157,6 +160,14 @@ return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } +static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst, 
+ unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm)); +} + static DecodeStatus decodeOperand_VS_16(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -173,6 +184,14 @@ return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm)); } +static DecodeStatus decodeOperand_AReg_64(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512)); +} + static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -181,6 +200,14 @@ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512)); } +static DecodeStatus decodeOperand_AReg_256(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512)); +} + static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -197,6 +224,127 @@ return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512)); } +static DecodeStatus decodeOperand_VReg_64(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm)); +} + +static DecodeStatus decodeOperand_VReg_128(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm)); +} + +static DecodeStatus decodeOperand_VReg_256(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm)); +} + +static DecodeStatus decodeOperand_VReg_512(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const 
void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm)); +} + +static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm)); +} + +static bool IsAGPROperand(const MCInst &Inst, int OpIdx, + const MCRegisterInfo *MRI) { + if (OpIdx < 0) + return false; + + const MCOperand &Op = Inst.getOperand(OpIdx); + if (!Op.isReg()) + return false; + + unsigned Sub = MRI->getSubReg(Op.getReg(), AMDGPU::sub0); + auto Reg = Sub ? Sub : Op.getReg(); + return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255; +} + +static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, + unsigned Imm, + AMDGPUDisassembler::OpWidthTy Opw, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + if (!DAsm->isGFX90A()) { + Imm &= 511; + } else { + // If atomic has both vdata and vdst their register classes are tied. + // The bit is decoded along with the vdst, first operand. We need to + // change register class to AGPR if vdst was AGPR. + // If a DS instruction has both data0 and data1 their register classes + // are also tied. + unsigned Opc = Inst.getOpcode(); + uint64_t TSFlags = DAsm->getMCII()->get(Opc).TSFlags; + uint16_t DataNameIdx = (TSFlags & SIInstrFlags::DS) ? 
AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata; + const MCRegisterInfo *MRI = DAsm->getContext().getRegisterInfo(); + int DataIdx = AMDGPU::getNamedOperandIdx(Opc, DataNameIdx); + if ((int)Inst.getNumOperands() == DataIdx) { + int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (IsAGPROperand(Inst, DstIdx, MRI)) + Imm |= 512; + } + + if (TSFlags & SIInstrFlags::DS) { + int Data2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); + if ((int)Inst.getNumOperands() == Data2Idx && + IsAGPROperand(Inst, DataIdx, MRI)) + Imm |= 512; + } + } + return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256)); +} + +static DecodeStatus DecodeAVLdSt_32RegisterClass(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return decodeOperand_AVLdSt_Any(Inst, Imm, + AMDGPUDisassembler::OPW32, Decoder); +} + +static DecodeStatus DecodeAVLdSt_64RegisterClass(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return decodeOperand_AVLdSt_Any(Inst, Imm, + AMDGPUDisassembler::OPW64, Decoder); +} + +static DecodeStatus DecodeAVLdSt_96RegisterClass(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return decodeOperand_AVLdSt_Any(Inst, Imm, + AMDGPUDisassembler::OPW96, Decoder); +} + +static DecodeStatus DecodeAVLdSt_128RegisterClass(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return decodeOperand_AVLdSt_Any(Inst, Imm, + AMDGPUDisassembler::OPW128, Decoder); +} + static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -341,6 +489,12 @@ Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address); if (Res) break; + if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) { + Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address); + if (Res) + break; + } + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) { Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address); if (Res) break; @@ -351,6 +505,13 @@ if (Bytes.size() < 4) break; 
const uint64_t QW = ((uint64_t)eatBytes(Bytes) << 32) | DW; + + if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) { + Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address); + if (Res) + break; + } + Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address); if (Res) break; @@ -369,6 +530,7 @@ MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx6_gfx7 || MI.getOpcode() == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 || MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi || + MI.getOpcode() == AMDGPU::V_FMAC_F64_e64_gfx90a || MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi || MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_gfx10 || MI.getOpcode() == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || @@ -384,6 +546,44 @@ insertNamedMCOperand(MI, MCOperand::createImm(1), AMDGPU::OpName::glc1); } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) && + (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts])) { + // GFX90A lost TFE, its place is occupied by ACC. + int TFEOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); + if (TFEOpIdx != -1) { + auto TFEIter = MI.begin(); + std::advance(TFEIter, TFEOpIdx); + MI.insert(TFEIter, MCOperand::createImm(0)); + } + } + + if (Res && (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::FLAT | + SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) { + if (!isGFX10()) { + int DLCOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dlc); + if (DLCOpIdx != -1) { + auto DLCIter = MI.begin(); + std::advance(DLCIter, DLCOpIdx); + MI.insert(DLCIter, MCOperand::createImm(0)); + } + } + } + + if (Res && (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) { + int SWZOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz); + if (SWZOpIdx != -1) { + auto SWZIter = MI.begin(); + std::advance(SWZIter, SWZOpIdx); + MI.insert(SWZIter, MCOperand::createImm(0)); + } + } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) { int VAddr0Idx = 
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); @@ -545,7 +745,7 @@ DstSize = (DstSize + 1) / 2; } - if (MI.getOperand(TFEIdx).getImm()) + if (TFEIdx != -1 && MI.getOperand(TFEIdx).getImm()) DstSize += 1; if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords) @@ -701,6 +901,10 @@ return decodeSrcOp(OPWV216, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VSrcV232(unsigned Val) const { + return decodeSrcOp(OPWV232, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { // Some instructions have operand restrictions beyond what the encoding // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra @@ -718,10 +922,18 @@ return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255); } +MCOperand AMDGPUDisassembler::decodeOperand_AReg_64(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_64RegClassID, Val & 255); +} + MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const { return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255); } +MCOperand AMDGPUDisassembler::decodeOperand_AReg_256(unsigned Val) const { + return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255); +} + MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const { return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255); } @@ -758,6 +970,10 @@ return createRegOperand(AMDGPU::VReg_512RegClassID, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VReg_1024(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_1024RegClassID, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const { // table-gen generated disassembler doesn't care about operand types // leaving only registry class so SSrc_32 operand turns into SReg_32 @@ -914,8 +1130,10 @@ case OPW128: // splat constants case OPW512: case OPW1024: + case OPWV232: return MCOperand::createImm(getInlineImmVal32(Imm)); case OPW64: + case OPW256: return 
MCOperand::createImm(getInlineImmVal64(Imm)); case OPW16: case OPWV216: @@ -935,8 +1153,14 @@ case OPW16: case OPWV216: return VGPR_32RegClassID; - case OPW64: return VReg_64RegClassID; + case OPW64: + case OPWV232: return VReg_64RegClassID; + case OPW96: return VReg_96RegClassID; case OPW128: return VReg_128RegClassID; + case OPW160: return VReg_160RegClassID; + case OPW256: return VReg_256RegClassID; + case OPW512: return VReg_512RegClassID; + case OPW1024: return VReg_1024RegClassID; } } @@ -950,8 +1174,11 @@ case OPW16: case OPWV216: return AGPR_32RegClassID; - case OPW64: return AReg_64RegClassID; + case OPW64: + case OPWV232: return AReg_64RegClassID; + case OPW96: return AReg_96RegClassID; case OPW128: return AReg_128RegClassID; + case OPW160: return AReg_160RegClassID; case OPW256: return AReg_256RegClassID; case OPW512: return AReg_512RegClassID; case OPW1024: return AReg_1024RegClassID; @@ -969,8 +1196,11 @@ case OPW16: case OPWV216: return SGPR_32RegClassID; - case OPW64: return SGPR_64RegClassID; + case OPW64: + case OPWV232: return SGPR_64RegClassID; + case OPW96: return SGPR_96RegClassID; case OPW128: return SGPR_128RegClassID; + case OPW160: return SGPR_160RegClassID; case OPW256: return SGPR_256RegClassID; case OPW512: return SGPR_512RegClassID; } @@ -986,7 +1216,8 @@ case OPW16: case OPWV216: return TTMP_32RegClassID; - case OPW64: return TTMP_64RegClassID; + case OPW64: + case OPWV232: return TTMP_64RegClassID; case OPW128: return TTMP_128RegClassID; case OPW256: return TTMP_256RegClassID; case OPW512: return TTMP_512RegClassID; @@ -1040,6 +1271,7 @@ case OPWV216: return decodeSpecialReg32(Val); case OPW64: + case OPWV232: return decodeSpecialReg64(Val); default: llvm_unreachable("unexpected immediate type"); @@ -1209,6 +1441,10 @@ bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); } +bool AMDGPUDisassembler::isGFX90A() const { + return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; +} + bool AMDGPUDisassembler::isGFX9Plus() 
const { return AMDGPU::isGFX9Plus(STI); } bool AMDGPUDisassembler::isGFX10() const { return AMDGPU::isGFX10(STI); } Index: llvm/lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/FLATInstructions.td +++ llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -54,6 +54,8 @@ bits<1> glcValue = 0; bits<1> has_dlc = 1; bits<1> dlcValue = 0; + bits<1> has_sccb = 1; + bits<1> sccbValue = 0; let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts, !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace)); @@ -88,14 +90,17 @@ // encoding fields bits<8> vaddr; - bits<8> vdata; + bits<10> vdata; bits<7> saddr; - bits<8> vdst; + bits<10> vdst; bits<1> slc; bits<1> glc; bits<1> dlc; + // Only valid on gfx90a+ + bits<1> sccb; + // Only valid on gfx9 bits<1> lds = 0; // XXX - What does this actually do? @@ -106,7 +111,8 @@ // Signed offset. Highest bit ignored for flat and treated as 12-bit // unsigned for flat accesses. bits<13> offset; - bits<1> nv = 0; // XXX - What does this actually do? + // GFX90A+ only: instruction uses AccVGPR for data + bits<1> acc = !if(ps.has_vdst, vdst{9}, !if(ps.has_data, vdata{9}, 0)); // We don't use tfe right now, and it was removed in gfx9. bits<1> tfe = 0; @@ -121,12 +127,12 @@ let Inst{24-18} = op; let Inst{31-26} = 0x37; // Encoding. let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); - let Inst{47-40} = !if(ps.has_data, vdata, ?); + let Inst{47-40} = !if(ps.has_data, vdata{7-0}, ?); let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0); // 54-48 is reserved. - let Inst{55} = nv; // nv on GFX9+, TFE before. - let Inst{63-56} = !if(ps.has_vdst, vdst, ?); + let Inst{55} = acc; // nv on GFX9+, TFE before. AccVGPR for data on GFX90A. + let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, ?); } class GlobalSaddrTable { @@ -139,9 +145,10 @@ // saddr is 32-bit (which isn't handled here yet). 
class FLAT_Load_Pseudo : FLAT_Pseudo< + bit HasSaddr = 0, bit EnableSaddr = 0, + RegisterOperand vdata_op = getLdStRegisterOperand.ret> : FLAT_Pseudo< opName, - (outs regClass:$vdst), + (outs vdata_op:$vdst), !con( !con( !if(EnableSaddr, @@ -149,9 +156,9 @@ (ins VReg_64:$vaddr)), (ins flat_offset:$offset)), // FIXME: Operands with default values do not work with following non-optional operands. - !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in), - (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))), - " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> { + !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, SCCB:$sccb, vdata_op:$vdst_in), + (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb))), + " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc$sccb"> { let has_data = 0; let mayLoad = 1; let has_saddr = HasSaddr; @@ -169,10 +176,10 @@ (outs), !con( !if(EnableSaddr, - (ins VGPR_32:$vaddr, vdataClass:$vdata, SReg_64:$saddr), - (ins VReg_64:$vaddr, vdataClass:$vdata)), - (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc)), - " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc"> { + (ins VGPR_32:$vaddr, getLdStRegisterOperand.ret:$vdata, SReg_64:$saddr), + (ins VReg_64:$vaddr, getLdStRegisterOperand.ret:$vdata)), + (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb)), + " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc$dlc$sccb"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -196,9 +203,9 @@ opName, (outs regClass:$vdst), !con(!if(EnableSaddr, (ins SReg_64:$saddr), (ins)), - (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc), + (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb), !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), - " $vdst, "#!if(EnableSaddr, "$saddr", 
"off")#"$offset$glc$slc$dlc"> { + " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc$sccb"> { let is_flat_global = 1; let has_data = 0; let mayLoad = 1; @@ -234,8 +241,8 @@ opName, (outs), !con(!if(EnableSaddr, (ins vdataClass:$vdata, SReg_64:$saddr), (ins vdataClass:$vdata)), - (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), - " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> { + (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc, SCCB_0:$sccb)), + " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc$sccb"> { let is_flat_global = 1; let mayLoad = 0; let mayStore = 1; @@ -266,16 +273,16 @@ bit EnableVaddr = !not(EnableSaddr)> : FLAT_Pseudo< opName, - (outs regClass:$vdst), + (outs getLdStRegisterOperand.ret:$vdst), !con( !if(EnableSaddr, (ins SReg_32_XEXEC_HI:$saddr, flat_offset:$offset), !if(EnableVaddr, (ins VGPR_32:$vaddr, flat_offset:$offset), (ins flat_offset:$offset))), - !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, regClass:$vdst_in), - (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))), - " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> { + !if(HasTiedOutput, (ins GLC:$glc, SLC:$slc, DLC:$dlc, SCCB:$sccb, getLdStRegisterOperand.ret:$vdst_in), + (ins GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb))), + " $vdst, "#!if(EnableVaddr, "$vaddr, ", "off, ")#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc$sccb"> { let has_data = 0; let mayLoad = 1; let has_saddr = 1; @@ -289,15 +296,16 @@ } class FLAT_Scratch_Store_Pseudo : FLAT_Pseudo< + bit EnableVaddr = !not(EnableSaddr), + RegisterOperand vdata_op = getLdStRegisterOperand.ret> : FLAT_Pseudo< opName, (outs), !if(EnableSaddr, - (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc), + (ins vdata_op:$vdata, SReg_32_XEXEC_HI:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb), !if(EnableVaddr, - (ins 
vdataClass:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc), - (ins vdataClass:$vdata, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc))), - " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> { + (ins vdata_op:$vdata, VGPR_32:$vaddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb), + (ins vdata_op:$vdata, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc, SCCB_0:$sccb))), + " "#!if(EnableVaddr, "$vaddr", "off")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc$sccb"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -344,6 +352,8 @@ let has_dlc = 0; let dlcValue = 0; let has_vdst = 0; + let has_sccb = 1; + let sccbValue = 0; let maybeAtomic = 1; let IsAtomicNoRet = 1; } @@ -355,6 +365,7 @@ let has_vdst = 1; let glcValue = 1; let dlcValue = 0; + let sccbValue = 0; let IsAtomicNoRet = 0; let IsAtomicRet = 1; let PseudoInstr = NAME # "_RTN"; @@ -367,11 +378,12 @@ SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType.ret> { + bit isFP = isFloatType.ret, + RegisterOperand data_op = getLdStRegisterOperand.ret> { def "" : FLAT_AtomicNoRet_Pseudo , + (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, SLC_0:$slc, SCCB_0:$sccb), + " $vaddr, $vdata$offset$slc$sccb">, GlobalSaddrTable<0, opName>, AtomicNoRet { let PseudoInstr = NAME; @@ -380,9 +392,9 @@ } def _RTN : FLAT_AtomicRet_Pseudo .ret:$vdst), + (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc, SCCB_0:$sccb), + " $vdst, $vaddr, $vdata$offset$glc1$slc$sccb", [(set vt:$vdst, (atomic (FLATOffset i64:$vaddr, i16:$offset), data_vt:$vdata))]>, GlobalSaddrTable<0, opName#"_rtn">, @@ -399,12 +411,13 @@ SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType.ret> { + bit isFP = isFloatType.ret, + RegisterOperand 
data_op = getLdStRegisterOperand.ret> { def "" : FLAT_AtomicNoRet_Pseudo , + (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, SLC_0:$slc, SCCB_0:$sccb), + " $vaddr, $vdata, off$offset$slc$sccb">, GlobalSaddrTable<0, opName>, AtomicNoRet { let has_saddr = 1; @@ -414,8 +427,8 @@ def _SADDR : FLAT_AtomicNoRet_Pseudo , + (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, SLC_0:$slc, SCCB_0:$sccb), + " $vaddr, $vdata, $saddr$offset$slc$sccb">, GlobalSaddrTable<1, opName>, AtomicNoRet { let has_saddr = 1; @@ -432,12 +445,14 @@ SDPatternOperator atomic = null_frag, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - bit isFP = isFloatType.ret> { + bit isFP = isFloatType.ret, + RegisterOperand data_op = getLdStRegisterOperand.ret, + RegisterOperand vdst_op = getLdStRegisterOperand.ret> { def _RTN : FLAT_AtomicRet_Pseudo , GlobalSaddrTable<0, opName#"_rtn">, @@ -447,9 +462,9 @@ } def _SADDR_RTN : FLAT_AtomicRet_Pseudo , + (outs vdst_op:$vdst), + (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, GLC_1:$glc1, SLC_0:$slc, SCCB_0:$sccb), + " $vdst, $vaddr, $vdata, $saddr$offset$glc1$slc$sccb">, GlobalSaddrTable<1, opName#"_rtn">, AtomicNoRet { let has_saddr = 1; @@ -608,6 +623,15 @@ } // End SubtargetPredicate = isGFX7GFX10 +let SubtargetPredicate = isGFX90APlus in { + defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>; + defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>; + defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>; + defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>; + defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>; + defm GLOBAL_ATOMIC_MAX_F64 : 
FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>; +} // End SubtargetPredicate = isGFX90APlus + defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; @@ -780,6 +804,15 @@ "global_atomic_pk_add_f16", VGPR_32, v2f16 >; } // End OtherPredicates = [HasAtomicFaddInsts] + +let OtherPredicates = [isGFX90APlus] in { + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < + "global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd + >; + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < + "global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd + >; +} // End OtherPredicates = [isGFX90APlus] } // End is_flat_global = 1 //===----------------------------------------------------------------------===// @@ -794,17 +827,17 @@ class FlatLoadPat_D16 : GCNPat < (node (FLATOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in), - (inst $vaddr, $offset, 0, 0, 0, $in) + (inst $vaddr, $offset, 0, 0, 0, 0, $in) >; class FlatSignedLoadPat_D16 : GCNPat < (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset), vt:$in), - (inst $vaddr, $offset, 0, 0, 0, $in) + (inst $vaddr, $offset, 0, 0, 0, 0, $in) >; class GlobalLoadSaddrPat_D16 : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$in)), - (inst $saddr, $voffset, $offset, 0, 0, 0, $in) + (inst $saddr, $voffset, $offset, 0, 0, 0, 0, $in) >; class FlatLoadSignedPat : GCNPat < @@ -895,7 +928,7 @@ class ScratchLoadSignedPat_D16 : GCNPat < (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in), - (inst $vaddr, $offset, 0, 0, 0, $in) + (inst $vaddr, $offset, 0, 0, 0, 0, $in) >; class ScratchStoreSignedPat : GCNPat < @@ -910,7 +943,7 @@ class ScratchLoadSaddrPat_D16 : GCNPat < (vt (node (ScratchSAddr (i32 
SGPR_32:$saddr), i16:$offset), vt:$in)), - (inst $saddr, $offset, 0, 0, 0, $in) + (inst $saddr, $offset, 0, 0, 0, 0, $in) >; class ScratchStoreSaddrPat ; } +let OtherPredicates = [isGFX90APlus] in { +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F32", atomic_load_fadd_global_32, f32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", atomic_load_fadd_v2f16_global_32, v2f16>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", atomic_load_fadd_global_64, f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", atomic_load_fmin_global_64, f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", atomic_load_fmax_global_64, f64>; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +} + } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { @@ -1345,6 +1389,8 @@ SIMCInstr { let AssemblerPredicate = isGFX8GFX9; let DecoderNamespace = "GFX8"; + + let Inst{25} = !if(ps.has_sccb, sccb, ps.sccbValue); } multiclass FLAT_Real_AllAddr_vi op> { @@ -1492,6 +1538,14 @@ defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; +let SubtargetPredicate = isGFX90APlus in { + defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_vi<0x4f, FLAT_ATOMIC_ADD_F64>; + defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_vi<0x50, FLAT_ATOMIC_MIN_F64>; + defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_vi<0x51, FLAT_ATOMIC_MAX_F64>; + defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_vi<0x4f>; + defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_vi<0x50>; + defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51>; +} // End SubtargetPredicate = isGFX90APlus //===----------------------------------------------------------------------===// // GFX10. 
@@ -1701,7 +1755,7 @@ let SubtargetPredicate = HasAtomicFaddInsts in { -defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Real_AllAddr_vi <0x04d>; -defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Real_AllAddr_vi <0x04e>; +defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e>; } // End SubtargetPredicate = HasAtomicFaddInsts Index: llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -137,7 +137,8 @@ case AMDGPU::IMPLICIT_DEF: return nullptr; case AMDGPU::COPY: - case AMDGPU::V_MOV_B32_e32: { + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::V_MOV_B64_PSEUDO: { auto &Op1 = Def->getOperand(1); if (Op1.isImm()) return &Op1; @@ -151,7 +152,8 @@ MachineInstr &MovMI, RegSubRegPair CombOldVGPR, bool CombBCZ) const { - assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); auto OrigOp = OrigMI.getOpcode(); auto DPPOp = getDPPOp(OrigOp); @@ -174,7 +176,11 @@ const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); if (OldIdx != -1) { assert(OldIdx == NumOperands); - assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)); + assert(isOfRegClass( + CombOldVGPR, + *MRI->getRegClass( + TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg()), + *MRI)); auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI); DPPInst.addReg(CombOldVGPR.Reg, Def ? 
0 : RegState::Undef, CombOldVGPR.SubReg); @@ -325,8 +331,10 @@ return nullptr; } CombOldVGPR = getRegSubRegPair(*Src1); - if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) { - LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n"); + auto MovDst = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); + const TargetRegisterClass *RC = MRI->getRegClass(MovDst->getReg()); + if (!isOfRegClass(CombOldVGPR, *RC, *MRI)) { + LLVM_DEBUG(dbgs() << " failed: src1 has wrong register class\n"); return nullptr; } } @@ -346,7 +354,8 @@ } bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { - assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp || + MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst); @@ -362,6 +371,17 @@ return false; } + if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { + auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl); + assert(DppCtrl && DppCtrl->isImm()); + if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) { + LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported" + " control value\n"); + // Let it split, then control may become legal. 
+ return false; + } + } + auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask); assert(RowMaskOpnd && RowMaskOpnd->isImm()); auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask); @@ -430,8 +450,9 @@ auto CombOldVGPR = getRegSubRegPair(*OldOpnd); // try to reuse previous old reg if its undefined (IMPLICIT_DEF) if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef + const TargetRegisterClass *RC = MRI->getRegClass(DPPMovReg); CombOldVGPR = RegSubRegPair( - MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass)); + MRI->createVirtualRegister(RC)); auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg); DPPMIs.push_back(UndefInst.getInstr()); @@ -581,12 +602,17 @@ Changed = true; ++NumDPPMovsCombined; } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) { - auto Split = TII->expandMovDPP64(MI); - for (auto M : { Split.first, Split.second }) { - if (combineDPPMov(*M)) - ++NumDPPMovsCombined; + if (ST.has64BitDPP() && combineDPPMov(MI)) { + Changed = true; + ++NumDPPMovsCombined; + } else { + auto Split = TII->expandMovDPP64(MI); + for (auto M : { Split.first, Split.second }) { + if (M && combineDPPMov(*M)) + ++NumDPPMovsCombined; + } + Changed = true; } - Changed = true; } } } Index: llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h =================================================================== --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -94,6 +94,9 @@ bool fixLdsBranchVmemWARHazard(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); + int checkMAIHazards908(MachineInstr *MI); + int checkMAIHazards90A(MachineInstr *MI); + int checkMAIVALUHazards(MachineInstr *MI); int checkMAILdStHazards(MachineInstr *MI); public: Index: llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ 
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -32,7 +32,7 @@ TRI(TII.getRegisterInfo()), ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) { - MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5; + MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5; TSchedModel.init(&ST); } @@ -87,6 +87,25 @@ } } +static bool isDGEMM(unsigned Opcode) { + return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || + Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 || + Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64; +} + +static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + + if (!SIInstrInfo::isMAI(MI) || + isDGEMM(Opcode) || + Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || + Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) + return false; + + return true; +} + static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, const MachineInstr &MI) { if (TII.isAlwaysGDS(MI.getOpcode())) @@ -165,6 +184,11 @@ if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) return HazardType; + if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || + SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) + return HazardType; + if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) return HazardType; @@ -274,6 +298,11 @@ if (isRWLane(MI->getOpcode())) WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); + if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) || + SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0) + WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI)); + if (MI->isInlineAsm()) return std::max(WaitStates, checkInlineAsmHazards(MI)); @@ -603,7 +632,7 @@ const int VmemSgprWaitStates = 5; auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; for (const MachineOperand &Use 
: VMEM->uses()) { - if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) + if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg())) continue; int WaitStatesNeededForUse = @@ -739,7 +768,7 @@ const int VALUWaitStates = 1; int WaitStatesNeeded = 0; - if (!TRI->isVGPR(MRI, Def.getReg())) + if (!TRI->isVectorRegister(MRI, Def.getReg())) return WaitStatesNeeded; Register Reg = Def.getReg(); auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) { @@ -1187,6 +1216,10 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) { assert(SIInstrInfo::isMAI(*MI)); + return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI); +} + +int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { int WaitStatesNeeded = 0; unsigned Opc = MI->getOpcode(); @@ -1353,8 +1386,166 @@ return WaitStatesNeeded; } +int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { + int WaitStatesNeeded = 0; + unsigned Opc = MI->getOpcode(); + + auto IsMFMAFn = [] (MachineInstr *MI) { + return SIInstrInfo::isMAI(*MI) && + MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; + }; + + auto IsLegacyVALUFn = [&IsMFMAFn] (MachineInstr *MI) { + return SIInstrInfo::isVALU(*MI) && !IsMFMAFn(MI); + }; + + auto IsLegacyVALUNotDotFn = [&IsMFMAFn] (MachineInstr *MI) { + return SIInstrInfo::isVALU(*MI) && + !IsMFMAFn(MI) && !SIInstrInfo::isDOT(*MI); + }; + + if (!IsMFMAFn(MI)) + return WaitStatesNeeded; + + const int VALUWritesExecWaitStates = 4; + int WaitStatesNeededForUse = VALUWritesExecWaitStates - + getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn, + VALUWritesExecWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + + // Loop for both DGEMM and S/HGEMM 2nd instruction. 
+ for (const MachineOperand &Use : MI->explicit_uses()) { + const int LegacyVALUNotDotWritesVGPRWaitStates = 2; + const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2; + const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8; + const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16; + const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3; + const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9; + const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17; + const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9; + const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4; + const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5; + const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11; + const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19; + const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6; + const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11; + const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4; + const int MaxWaitStates = 19; + + if (!Use.isReg()) + continue; + unsigned Reg = Use.getReg(); + bool FullReg; + MachineInstr *MI1; + + auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1, this] + (MachineInstr *MI) { + if (!IsMFMAFn(MI)) + return false; + if (!isDGEMM(MI->getOpcode()) && !isXDL(ST, *MI)) + return false; + Register DstReg = MI->getOperand(0).getReg(); + FullReg = (DstReg == Reg); + MI1 = MI; + return TRI.regsOverlap(DstReg, Reg); + }; + + WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates - + getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn, + MaxWaitStates); + if (NumWaitStates == std::numeric_limits::max()) + continue; + + int OpNo = MI->getOperandNo(&Use); + unsigned Opc1 = MI1->getOpcode(); + int NeedWaitStates = 0; + if (OpNo == SrcCIdx) { + if (!isDGEMM(Opc) && 
isDGEMM(Opc1)) { + NeedWaitStates = 0; + } else if (FullReg) { + if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || + Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) && + (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 || + Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64)) + NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates; + } else { + switch (Opc1) { + case AMDGPU::V_MFMA_F64_16X16X4F64_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + if (!isXDL(ST, *MI)) + NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates; + break; + case AMDGPU::V_MFMA_F64_4X4X4F64_e64: + case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: + if (!isXDL(ST, *MI)) + NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates; + break; + default: + switch (TSchedModel.computeInstrLatency(MI1)) { + case 2: + NeedWaitStates = isDGEMM(Opc) + ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates; + break; + case 8: + NeedWaitStates = isDGEMM(Opc) + ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: + NeedWaitStates = isDGEMM(Opc) + ? 
SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates + : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates; + } + } + } + } else { + switch (Opc1) { + case AMDGPU::V_MFMA_F64_16X16X4F64_e64: + case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64: + NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates; + break; + case AMDGPU::V_MFMA_F64_4X4X4F64_e64: + case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64: + NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates; + break; + default: + switch (TSchedModel.computeInstrLatency(MI1)) { + case 2: + NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates; + break; + case 8: + NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: + NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates; + } + } + } + if (WaitStatesNeeded >= NeedWaitStates) + continue; + + WaitStatesNeededForUse = NeedWaitStates - NumWaitStates; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + break; + } + + return WaitStatesNeeded; +} + int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) { - if (!ST.hasMAIInsts()) + // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards() + if (!ST.hasMAIInsts() || ST.hasGFX90AInsts()) return 0; int WaitStatesNeeded = 0; @@ -1399,6 +1590,234 @@ return WaitStatesNeeded; } +int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { + if (!ST.hasGFX90AInsts()) + return 0; + + auto IsMFMAFn = [] (MachineInstr *MI) -> bool { + return SIInstrInfo::isMAI(*MI) && + MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64; + }; + + auto IsDGEMMFn = [] (MachineInstr *MI) -> bool { + return isDGEMM(MI->getOpcode()); + }; + + // This is checked in checkMAIHazards90A() + if (IsMFMAFn(MI)) + return 0; + + int WaitStatesNeeded = 0; + + bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI) || +
SIInstrInfo::isDS(*MI) || + SIInstrInfo::isEXP(*MI); + bool IsVALU = SIInstrInfo::isVALU(*MI); + + MachineInstr *MFMA = nullptr; + unsigned Reg; + auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA, this] (MachineInstr *MI) { + if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI->getOperand(0).getReg(), Reg)) + return false; + if (!isDGEMM(MI->getOpcode()) && !isXDL(ST, *MI)) + return false; + MFMA = MI; + return true; + }; + + MachineInstr *DOT = nullptr; + auto IsDotWriteFn = [&Reg, &DOT, this] (MachineInstr *MI) { + if (!SIInstrInfo::isDOT(*MI) || + !TRI.regsOverlap(MI->getOperand(0).getReg(), Reg)) + return false; + DOT = MI; + return true; + }; + + int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src2); + + if (IsMemOrExport || IsVALU) { + const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5; + const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11; + const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19; + const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9; + const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18; + const int DMFMA4x4WriteVgprVALUReadWaitStates = 6; + const int DMFMA16x16WriteVgprVALUReadWaitStates = 11; + const int DotWriteSameDotReadSrcAB = 3; + const int DotWriteDifferentVALURead = 3; + const int MaxWaitStates = 19; + + for (const MachineOperand &Use : MI->explicit_uses()) { + if (!Use.isReg()) + continue; + Reg = Use.getReg(); + + DOT = nullptr; + int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, + MaxWaitStates); + if (DOT) { + int NeedWaitStates = 0; + if (DOT->getOpcode() == MI->getOpcode()) { + if (&Use - &MI->getOperand(0) != SrcCIdx) + NeedWaitStates = DotWriteSameDotReadSrcAB; + } else { + NeedWaitStates = DotWriteDifferentVALURead; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + MFMA = nullptr; + WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn, + MaxWaitStates); 
+ if (!MFMA) + continue; + + unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); + int NeedWaitStates = MaxWaitStates; + switch (HazardDefLatency) { + case 2: + NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates; + break; + case 4: + assert(isDGEMM(MFMA->getOpcode())); + NeedWaitStates = + IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates + : DMFMA4x4WriteVgprVALUReadWaitStates; + break; + case 8: + NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: + NeedWaitStates = + isDGEMM(MFMA->getOpcode()) + ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates + : DMFMA16x16WriteVgprVALUReadWaitStates + : SMFMA32x32WriteVgprVALUMemExpReadWaitStates; + break; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + break; + } + } + + unsigned Opc = MI->getOpcode(); + const int DMFMAToFMA64WaitStates = 2; + if ((Opc == AMDGPU::V_FMA_F64_e64 || + Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 || + Opc == AMDGPU::V_FMAC_F64_dpp) && + WaitStatesNeeded < DMFMAToFMA64WaitStates) { + int WaitStatesNeededForUse = DMFMAToFMA64WaitStates - + getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + if (!IsVALU && !IsMemOrExport) + return WaitStatesNeeded; + + for (const MachineOperand &Def : MI->defs()) { + const int SMFMA4x4WriteVgprVALUWawWaitStates = 5; + const int SMFMA16x16WriteVgprVALUWawWaitStates = 11; + const int SMFMA32x32WriteVgprVALUWawWaitStates = 19; + const int SMFMA4x4ReadVgprVALUWarWaitStates = 1; + const int SMFMA16x16ReadVgprVALUWarWaitStates = 7; + const int SMFMA32x32ReadVgprVALUWarWaitStates = 15; + const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6; + const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11; + const int DotWriteDifferentVALUWrite = 3; 
+ const int MaxWaitStates = 19; + const int MaxWarWaitStates = 15; + + Reg = Def.getReg(); + + DOT = nullptr; + int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn, + MaxWaitStates); + if (DOT && DOT->getOpcode() != MI->getOpcode()) + WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite - + WaitStatesSinceDef); + + MFMA = nullptr; + WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn, + MaxWaitStates); + if (MFMA) { + int NeedWaitStates = MaxWaitStates; + switch (TSchedModel.computeInstrLatency(MFMA)) { + case 2: + NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates; + break; + case 4: + assert(isDGEMM(MFMA->getOpcode())); + NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates; + break; + case 8: + NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates; + break; + case 16: LLVM_FALLTHROUGH; + default: + NeedWaitStates = isDGEMM(MFMA->getOpcode()) + ? DMFMA16x16WriteVgprVALUWriteWaitStates + : SMFMA32x32WriteVgprVALUWawWaitStates; + break; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + if (WaitStatesNeeded == MaxWaitStates) + break; + } + + auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA, this] + (MachineInstr *MI) { + if (!IsMFMAFn(MI) || isDGEMM(MI->getOpcode()) || + !MI->readsRegister(Reg, &TRI)) + return false; + + MachineOperand *SrcC = TII.getNamedOperand(*MI, AMDGPU::OpName::src2); + assert(SrcC); + if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg)) + return false; + + MFMA = MI; + return true; + }; + + MFMA = nullptr; + int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn, + MaxWarWaitStates); + if (!MFMA) + continue; + + unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA); + int NeedWaitStates = MaxWaitStates; + switch (HazardDefLatency) { + case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates; + break; + case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates; + 
break; + case 16: LLVM_FALLTHROUGH; + default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates; + break; + } + + int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse; + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; +} + bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { if (!SU->isInstr()) return false; Index: llvm/lib/Target/AMDGPU/GCNProcessors.td =================================================================== --- llvm/lib/Target/AMDGPU/GCNProcessors.td +++ llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -184,6 +184,10 @@ FeatureISAVersion9_0_9.Features >; +def : ProcessorModel<"gfx90a", SIDPFullSpeedModel, + FeatureISAVersion9_0_A.Features +>; + def : ProcessorModel<"gfx90c", SIQuarterSpeedModel, FeatureISAVersion9_0_C.Features >; Index: llvm/lib/Target/AMDGPU/GCNRegPressure.h =================================================================== --- llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -42,12 +42,19 @@ clear(); } - bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; } + bool empty() const { return getSGPRNum() == 0 && getVGPRNum(false) == 0; } void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } unsigned getSGPRNum() const { return Value[SGPR32]; } - unsigned getVGPRNum() const { return std::max(Value[VGPR32], Value[AGPR32]); } + unsigned getVGPRNum(bool UnifiedVGPRFile) const { + if (UnifiedVGPRFile) { + return Value[AGPR32] ? 
alignTo(Value[VGPR32], 4) + Value[AGPR32] + : Value[VGPR32] + Value[AGPR32]; + } + return std::max(Value[VGPR32], Value[AGPR32]); + } + unsigned getAGPRNum() const { return Value[AGPR32]; } unsigned getVGPRTuplesWeight() const { return std::max(Value[VGPR_TUPLE], Value[AGPR_TUPLE]); } @@ -55,7 +62,7 @@ unsigned getOccupancy(const GCNSubtarget &ST) const { return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), - ST.getOccupancyWithNumVGPRs(getVGPRNum())); + ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()))); } void inc(unsigned Reg, Index: llvm/lib/Target/AMDGPU/GCNRegPressure.cpp =================================================================== --- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -125,12 +125,14 @@ unsigned MaxOccupancy) const { const auto SGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(getSGPRNum())); - const auto VGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(getVGPRNum())); + const auto VGPROcc = + std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()))); const auto OtherSGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); - const auto OtherVGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(O.getVGPRNum())); + const auto OtherVGPROcc = + std::min(MaxOccupancy, + ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()))); const auto Occ = std::min(SGPROcc, VGPROcc); const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); @@ -161,7 +163,8 @@ } } return SGPRImportant ? 
(getSGPRNum() < O.getSGPRNum()): - (getVGPRNum() < O.getVGPRNum()); + (getVGPRNum(ST.hasGFX90AInsts()) < + O.getVGPRNum(ST.hasGFX90AInsts())); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -169,7 +172,9 @@ void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const { OS << "VGPRs: " << Value[VGPR32] << ' '; OS << "AGPRs: " << Value[AGPR32]; - if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; + if (ST) OS << "(O" + << ST->getOccupancyWithNumVGPRs(getVGPRNum(ST->hasGFX90AInsts())) + << ')'; OS << ", SGPRs: " << getSGPRNum(); if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')'; OS << ", LVGPR WT: " << getVGPRTuplesWeight() Index: llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp =================================================================== --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -335,7 +335,7 @@ PressureAfter.print(dbgs())); if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && - PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) { + PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { Pressure[RegionIdx] = PressureAfter; LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; @@ -366,7 +366,8 @@ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); - if (PressureAfter.getVGPRNum() > MaxVGPRs || + if (PressureAfter.getVGPRNum(false) > MaxVGPRs || + PressureAfter.getAGPRNum() > MaxVGPRs || PressureAfter.getSGPRNum() > MaxSGPRs) RescheduleRegions[RegionIdx] = true; Index: llvm/lib/Target/AMDGPU/GCNSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -82,6 +82,7 @@ bool FastFMAF32; bool FastDenormalF32; bool HalfRate64Ops; + bool FullRate64Ops; // Dynamically set bits that enable features. bool FlatForGlobal; @@ -95,6 +96,7 @@ // for XNACK. 
bool EnableXNACK; + bool EnableTgSplit; bool EnableCuMode; bool TrapHandler; @@ -110,10 +112,11 @@ bool FP64; bool FMA; bool MIMG_R128; - bool GCN3Encoding; + bool IsGCN; bool CIInsts; bool GFX8Insts; bool GFX9Insts; + bool GFX90AInsts; bool GFX10Insts; bool GFX10_3Insts; bool GFX7GFX8GFX9Insts; @@ -132,6 +135,9 @@ bool HasSDWAOutModsVOPC; bool HasDPP; bool HasDPP8; + bool Has64BitDPP; + bool HasPackedFP32Ops; + bool HasExtendedImageInsts; bool HasR128A16; bool HasGFX10A16; bool HasG16; @@ -167,10 +173,16 @@ bool ScalarFlatScratchInsts; bool AddNoCarryInsts; bool HasUnpackedD16VMem; + bool R600ALUInst; + bool CaymanISA; + bool CFALUBug; bool LDSMisalignedBug; bool HasMFMAInlineLiteralBug; + bool HasVertexCache; + short TexVTXClauseSize; bool UnalignedBufferAccess; bool UnalignedDSAccess; + bool HasPackedTID; bool ScalarizeGlobal; bool HasVcmpxPermlaneHazard; @@ -295,6 +307,10 @@ return HalfRate64Ops; } + bool hasFullRate64Ops() const { + return FullRate64Ops; + } + bool hasAddr64() const { return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); } @@ -510,6 +526,10 @@ return TargetID.isXnackOnOrAny(); } + bool isTgSplitEnabled() const { + return EnableTgSplit; + } + bool isCuModeEnabled() const { return EnableCuMode; } @@ -796,6 +816,18 @@ return HasDPP8; } + bool has64BitDPP() const { + return Has64BitDPP; + } + + bool hasPackedFP32Ops() const { + return HasPackedFP32Ops; + } + + bool hasExtendedImageInsts() const { + return HasExtendedImageInsts; + } + bool hasR128A16() const { return HasR128A16; } @@ -896,6 +928,10 @@ bool hasHardClauses() const { return getGeneration() >= GFX10; } + bool hasGFX90AInsts() const { return GFX90AInsts; } + + bool hasPackedTID() const { return HasPackedTID; } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; Index: llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h 
=================================================================== --- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -70,6 +70,8 @@ raw_ostream &O); void printDLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSCCB(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, Index: llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp =================================================================== --- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -10,6 +10,7 @@ #include "AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" +#include "SIRegisterInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/MC/MCExpr.h" @@ -207,6 +208,12 @@ printNamedBit(MI, OpNo, O, "dlc"); } +void AMDGPUInstPrinter::printSCCB(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + if (AMDGPU::isGFX90A(STI)) + printNamedBit(MI, OpNo, O, "scc"); +} + void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "glc"); @@ -601,6 +608,10 @@ case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: case MCOI::OPERAND_IMMEDIATE: printImmediate32(Op.getImm(), STI, O); break; @@ -608,6 +619,7 @@ case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + 
case AMDGPU::OPERAND_REG_INLINE_AC_FP64: printImmediate64(Op.getImm(), STI, O); break; case AMDGPU::OPERAND_REG_INLINE_C_INT16: @@ -794,7 +806,20 @@ using namespace AMDGPU::DPP; unsigned Imm = MI->getOperand(OpNo).getImm(); - if (Imm <= DppCtrl::QUAD_PERM_LAST) { + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::vdst); + int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::src0); + + if (((DstIdx >= 0 && + Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID) || + ((Src0Idx >= 0 && + Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID))) && + !AMDGPU::isLegal64BitDPPControl(Imm)) { + O << " /* 64 bit dpp only supports row_newbcast */"; + return; + } else if (Imm <= DppCtrl::QUAD_PERM_LAST) { O << "quad_perm:["; O << formatDec(Imm & 0x3) << ','; O << formatDec((Imm & 0xc) >> 2) << ','; @@ -854,11 +879,15 @@ O << "row_bcast:31"; } else if ((Imm >= DppCtrl::ROW_SHARE_FIRST) && (Imm <= DppCtrl::ROW_SHARE_LAST)) { - if (!AMDGPU::isGFX10Plus(STI)) { - O << "/* row_share is not supported on ASICs earlier than GFX10 */"; + if (AMDGPU::isGFX90A(STI)) { + O << " row_newbcast:"; + } else if (AMDGPU::isGFX10Plus(STI)) { + O << "row_share:"; + } else { + O << " /* row_newbcast/row_share is not supported on ASICs earlier " + "than GFX90A/GFX10 */"; return; } - O << "row_share:"; printU4ImmDecOperand(MI, OpNo, O); } else if ((Imm >= DppCtrl::ROW_XMASK_FIRST) && (Imm <= DppCtrl::ROW_XMASK_LAST)) { Index: llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h =================================================================== --- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -85,7 +85,7 @@ virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0; /// \returns True on success, false on failure. 
- virtual bool EmitCodeEnd() = 0; + virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) = 0; virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, @@ -129,7 +129,7 @@ bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. - bool EmitCodeEnd() override; + bool EmitCodeEnd(const MCSubtargetInfo &STI) override; void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, @@ -177,7 +177,7 @@ bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. - bool EmitCodeEnd() override; + bool EmitCodeEnd(const MCSubtargetInfo &STI) override; void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, Index: llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -86,6 +86,7 @@ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX908: AK = GK_GFX908; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; @@ -145,6 +146,7 @@ case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; case GK_GFX908: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX908; case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; + case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A; case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; @@ -258,10 +260,19 @@ 
return true; } -bool AMDGPUTargetAsmStreamer::EmitCodeEnd() { +bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { const uint32_t Encoded_s_code_end = 0xbf9f0000; - OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n'; - OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n'; + const uint32_t Encoded_s_nop = 0xbf800000; + uint32_t Encoded_pad = Encoded_s_code_end; + unsigned FillSize = 48; + + if (AMDGPU::isGFX90A(STI)) { + Encoded_pad = Encoded_s_nop; + FillSize = 256; + } + + OS << "\t.p2alignl 6, " << Encoded_pad << '\n'; + OS << "\t.fill " << FillSize << ", 4, " << Encoded_pad << '\n'; return true; } @@ -331,6 +342,12 @@ OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n'; OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n'; + if (AMDGPU::isGFX90A(STI)) + OS << "\t\t.amdhsa_accum_offset " << + (AMDHSA_BITS_GET(KD.compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4 + << '\n'; + if (!ReserveVCC) OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n'; if (IVersion.Major >= 7 && !ReserveFlatScr) @@ -360,6 +377,10 @@ PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + if (AMDGPU::isGFX90A(STI)) + PRINT_FIELD(OS, ".amdhsa_tg_split", KD, + compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT); if (IVersion.Major >= 10) { PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD, compute_pgm_rsrc1, @@ -616,14 +637,22 @@ return true; } -bool AMDGPUTargetELFStreamer::EmitCodeEnd() { +bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { const uint32_t Encoded_s_code_end = 0xbf9f0000; + const uint32_t Encoded_s_nop = 0xbf800000; + uint32_t Encoded_pad = Encoded_s_code_end; + unsigned FillSize = 48; + + if (AMDGPU::isGFX90A(STI)) { + Encoded_pad = Encoded_s_nop; + FillSize = 256; + } MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.emitValueToAlignment(64, Encoded_s_code_end, 4); - for (unsigned I = 0; I < 48; ++I) - 
OS.emitInt32(Encoded_s_code_end); + OS.emitValueToAlignment(64, Encoded_pad, 4); + for (unsigned I = 0; I < FillSize; ++I) + OS.emitInt32(Encoded_pad); OS.PopSection(); return true; } Index: llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp =================================================================== --- llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -234,12 +234,17 @@ case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: return getLit32Encoding(static_cast(Imm), STI); case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: return getLit64Encoding(static_cast(Imm), STI); case AMDGPU::OPERAND_REG_IMM_INT16: Index: llvm/lib/Target/AMDGPU/MIMGInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -11,12 +11,14 @@ // // - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8) // - MIMGEncGfx8: encoding introduced with gfx8 for atomics +// - MIMGEncGfx90a: encoding for gfx90a for atomics // - MIMGEncGfx10Default: gfx10 default (non-NSA) encoding // - MIMGEncGfx10NSA: gfx10 NSA encoding class MIMGEncoding; def MIMGEncGfx6 : MIMGEncoding; def MIMGEncGfx8 : MIMGEncoding; +def MIMGEncGfx90a : MIMGEncoding; def MIMGEncGfx10Default : MIMGEncoding; def MIMGEncGfx10NSA : MIMGEncoding; @@ -207,14 +209,24 @@ // Base class of all pre-gfx10 MIMG instructions. 
class MIMG_gfx6789 op, dag outs, string dns = ""> : MIMG, MIMGe_gfx6789 { - let SubtargetPredicate = isGFX6GFX7GFX8GFX9; - let AssemblerPredicate = isGFX6GFX7GFX8GFX9; + let SubtargetPredicate = isGFX6GFX7GFX8GFX9NotGFX90A; + let AssemblerPredicate = isGFX6GFX7GFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx6; let d16 = !if(BaseOpcode.HasD16, ?, 0); } +class MIMG_gfx90a op, dag outs, string dns = ""> + : MIMG, MIMGe_gfx90a { + let SubtargetPredicate = isGFX90APlus; + let AssemblerPredicate = isGFX90APlus; + + let MIMGEncoding = MIMGEncGfx90a; + + let d16 = !if(BaseOpcode.HasD16, ?, 0); +} + // Base class of all non-NSA gfx10 MIMG instructions. class MIMG_gfx10 : MIMG, MIMGe_gfx10 { @@ -250,10 +262,23 @@ string dns=""> : MIMG_gfx6789 { let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, - DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$tfe$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_NoSampler_Helper_gfx90a + : MIMG_gfx90a .ret:$vdata), dns> { + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc, + R128A16:$r128, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$lwe$da" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -284,12 +309,16 @@ multiclass MIMG_NoSampler_Src_Helper { + bit enableDisasm, + bit ExtendedImageInst = 1> { let ssamp = 0 in { let VAddrDwords = 1 in { if op.HAS_BASE then { def _V1 : MIMG_NoSampler_Helper ; + foreach _ = BoolToList.ret in + def _V1_gfx90a : MIMG_NoSampler_Helper_gfx90a ; def _V1_gfx10 : MIMG_NoSampler_gfx10; } @@ -298,6 +327,8 @@ 
let VAddrDwords = 2 in { if op.HAS_BASE then { def _V2 : MIMG_NoSampler_Helper ; + foreach _ = BoolToList.ret in + def _V2_gfx90a : MIMG_NoSampler_Helper_gfx90a ; def _V2_gfx10 : MIMG_NoSampler_gfx10; def _V2_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; } @@ -306,6 +337,8 @@ let VAddrDwords = 3 in { if op.HAS_BASE then { def _V3 : MIMG_NoSampler_Helper ; + foreach _ = BoolToList.ret in + def _V3_gfx90a : MIMG_NoSampler_Helper_gfx90a ; def _V3_gfx10 : MIMG_NoSampler_gfx10; def _V3_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; } @@ -314,6 +347,8 @@ let VAddrDwords = 4 in { if op.HAS_BASE then { def _V4 : MIMG_NoSampler_Helper ; + foreach _ = BoolToList.ret in + def _V4_gfx90a : MIMG_NoSampler_Helper_gfx90a ; def _V4_gfx10 : MIMG_NoSampler_gfx10; def _V4_nsa_gfx10 : MIMG_NoSampler_nsa_gfx10; @@ -323,7 +358,8 @@ } multiclass MIMG_NoSampler { + bit isResInfo = 0, + bit msaa = 0> { def "" : MIMGBaseOpcode { let Coordinates = !not(isResInfo); let LodOrClampOrMip = mip; @@ -333,15 +369,15 @@ let BaseOpcode = !cast(NAME), mayLoad = !not(isResInfo) in { let VDataDwords = 1 in - defm _V1 : MIMG_NoSampler_Src_Helper ; + defm _V1 : MIMG_NoSampler_Src_Helper ; let VDataDwords = 2 in - defm _V2 : MIMG_NoSampler_Src_Helper ; + defm _V2 : MIMG_NoSampler_Src_Helper ; let VDataDwords = 3 in - defm _V3 : MIMG_NoSampler_Src_Helper ; + defm _V3 : MIMG_NoSampler_Src_Helper ; let VDataDwords = 4 in - defm _V4 : MIMG_NoSampler_Src_Helper ; + defm _V4 : MIMG_NoSampler_Src_Helper ; let VDataDwords = 5 in - defm _V5 : MIMG_NoSampler_Src_Helper ; + defm _V5 : MIMG_NoSampler_Src_Helper ; } } @@ -351,10 +387,24 @@ string dns = ""> : MIMG_gfx6789 { let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" 
+ let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$tfe$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Store_Helper_gfx90a + : MIMG_gfx90a { + let InOperandList = !con((ins getLdStRegisterOperand.ret:$vdata, + addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc, + R128A16:$r128, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$lwe$da" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -393,6 +443,8 @@ if op.HAS_BASE then { def _V1 : MIMG_Store_Helper ; + def _V1_gfx90a : MIMG_Store_Helper_gfx90a ; def _V1_gfx10 : MIMG_Store_gfx10 ; } @@ -400,6 +452,7 @@ let VAddrDwords = 2 in { if op.HAS_BASE then { def _V2 : MIMG_Store_Helper ; + def _V2_gfx90a : MIMG_Store_Helper_gfx90a ; def _V2_gfx10 : MIMG_Store_gfx10 ; def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 ; } @@ -407,6 +460,7 @@ let VAddrDwords = 3 in { if op.HAS_BASE then { def _V3 : MIMG_Store_Helper ; + def _V3_gfx90a : MIMG_Store_Helper_gfx90a ; def _V3_gfx10 : MIMG_Store_gfx10 ; def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 ; } @@ -414,6 +468,7 @@ let VAddrDwords = 4 in { if op.HAS_BASE then { def _V4 : MIMG_Store_Helper ; + def _V4_gfx90a : MIMG_Store_Helper_gfx90a ; def _V4_gfx10 : MIMG_Store_gfx10 ; def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 ; @@ -450,9 +505,22 @@ let AsmMatchConverter = "cvtMIMGAtomic"; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); - let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"; + let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$tfe$lwe$da"; +} + +class MIMG_Atomic_gfx90a_base op, string asm, RegisterClass data_rc, + RegisterClass addr_rc, string dns=""> + : MIMG_gfx90a .ret:$vdst), dns> { 
+ let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; + + let InOperandList = (ins getLdStRegisterOperand.ret:$vdata, + addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc, + R128A16:$r128, LWE:$lwe, DA:$da); + let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$sccb$glc$slc$r128$lwe$da"; } class MIMG_Atomic_si : MIMG_Atomic_gfx6789_base { - let AssemblerPredicate = isGFX8GFX9; + let AssemblerPredicate = isGFX8GFX9NotGFX90A; let MIMGEncoding = MIMGEncGfx8; } +class MIMG_Atomic_gfx90a + : MIMG_Atomic_gfx90a_base { + let AssemblerPredicate = isGFX90APlus; + let MIMGEncoding = MIMGEncGfx90a; +} + class MIMG_Atomic_gfx10 @@ -512,6 +587,7 @@ } if op.HAS_VI then { def _V1_vi : MIMG_Atomic_vi ; + def _V1_gfx90a : MIMG_Atomic_gfx90a ; } if op.HAS_BASE then { def _V1_gfx10 : MIMG_Atomic_gfx10 ; @@ -523,6 +599,7 @@ } if op.HAS_VI then { def _V2_vi : MIMG_Atomic_vi ; + def _V2_gfx90a : MIMG_Atomic_gfx90a ; } if op.HAS_BASE then { def _V2_gfx10 : MIMG_Atomic_gfx10 ; @@ -535,6 +612,7 @@ } if op.HAS_VI then { def _V3_vi : MIMG_Atomic_vi ; + def _V3_gfx90a : MIMG_Atomic_gfx90a ; } if op.HAS_BASE then { def _V3_gfx10 : MIMG_Atomic_gfx10 ; @@ -542,12 +620,13 @@ } } let VAddrDwords = 4 in { - if op.HAS_VI then { - def _V4_vi : MIMG_Atomic_vi ; - } if op.HAS_SI then { def _V4_si : MIMG_Atomic_si ; } + if op.HAS_VI then { + def _V4_vi : MIMG_Atomic_vi ; + def _V4_gfx90a : MIMG_Atomic_gfx90a ; + } if op.HAS_BASE then { def _V4_gfx10 : MIMG_Atomic_gfx10 ; def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 ; @@ -579,10 +658,21 @@ RegisterClass src_rc, string dns=""> : MIMG_gfx6789 { let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, - DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); - let AsmString = asm#" $vdata, $vaddr, $srsrc, 
$ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da" + let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$sccb$glc$slc$r128$tfe$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMG_Sampler_gfx90a + : MIMG_gfx90a.ret:$vdata), dns> { + let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + DMask:$dmask, UNorm:$unorm, SCCB_0:$sccb, GLC:$glc, SLC:$slc, + R128A16:$r128, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$sccb$glc$slc$r128$lwe$da" #!if(BaseOpcode.HasD16, "$d16", ""); } @@ -686,13 +776,18 @@ multiclass MIMG_Sampler_Src_Helper { + bit enableDisasm = 0, + bit ExtendedImageInst = 1> { foreach addr = MIMG_Sampler_AddrSizes.MachineInstrs in { let VAddrDwords = addr.NumWords in { if op.HAS_BASE then { def _V # addr.NumWords : MIMG_Sampler_Helper ; + foreach _ = BoolToList.ret in + def _V # addr.NumWords # _gfx90a + : MIMG_Sampler_gfx90a ; def _V # addr.NumWords # _gfx10 : MIMG_Sampler_gfx10 ; @@ -721,7 +816,8 @@ multiclass MIMG_Sampler { + string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", ""), + bit ExtendedImageInst = !ne(sample.LowerCaseMod, "")> { def "" : MIMG_Sampler_BaseOpcode { let HasD16 = !not(isGetLod); let G16 = isG16; @@ -730,15 +826,15 @@ let BaseOpcode = !cast(NAME), WQM = wqm, mayLoad = !not(isGetLod) in { let VDataDwords = 1 in - defm _V1 : MIMG_Sampler_Src_Helper; + defm _V1 : MIMG_Sampler_Src_Helper; let VDataDwords = 2 in - defm _V2 : MIMG_Sampler_Src_Helper; + defm _V2 : MIMG_Sampler_Src_Helper; let VDataDwords = 3 in - defm _V3 : MIMG_Sampler_Src_Helper; + defm _V3 : MIMG_Sampler_Src_Helper; let VDataDwords = 4 in - defm _V4 : MIMG_Sampler_Src_Helper; + defm _V4 : MIMG_Sampler_Src_Helper; let VDataDwords = 5 in - defm _V5 : MIMG_Sampler_Src_Helper; + defm _V5 : MIMG_Sampler_Src_Helper; } } @@ -848,7 +944,9 @@ defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic , "image_atomic_fcmpswap", 0, 1>; defm 
IMAGE_ATOMIC_FMIN : MIMG_Atomic , "image_atomic_fmin", 0, 1>; defm IMAGE_ATOMIC_FMAX : MIMG_Atomic , "image_atomic_fmax", 0, 1>; + defm IMAGE_SAMPLE : MIMG_Sampler_WQM , AMDGPUSample>; +let OtherPredicates = [HasExtendedImageInsts] in { defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM , AMDGPUSample_cl>; defm IMAGE_SAMPLE_D : MIMG_Sampler , AMDGPUSample_d>; defm IMAGE_SAMPLE_D_CL : MIMG_Sampler , AMDGPUSample_d_cl>; @@ -932,12 +1030,12 @@ defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler , AMDGPUSample_cd_cl_o, 0, 1>; defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler , AMDGPUSample_c_cd_o, 0, 1>; defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler , AMDGPUSample_c_cd_cl_o, 0, 1>; - -//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>; -//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", mimgopc<0x7f>>; +} // End OtherPredicates = [HasExtendedImageInsts] +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; let SubtargetPredicate = HasGFX10_BEncoding in -defm IMAGE_MSAA_LOAD : MIMG_NoSampler , "image_msaa_load", 1>; +defm IMAGE_MSAA_LOAD : MIMG_NoSampler , "image_msaa_load", 1, 0, 0, 1>; defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay, "image_bvh_intersect_ray", 11, 0>; defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay, "image_bvh_intersect_ray", 8, 1>; Index: llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp +++ llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp @@ -77,7 +77,7 @@ if (!TFE && !LWE) // intersect_ray continue; - unsigned TFEVal = TFE->getImm(); + unsigned TFEVal = TFE ? TFE->getImm() : 0; unsigned LWEVal = LWE->getImm(); unsigned D16Val = D16 ? 
D16->getImm() : 0; Index: llvm/lib/Target/AMDGPU/SIDefines.h =================================================================== --- llvm/lib/Target/AMDGPU/SIDefines.h +++ llvm/lib/Target/AMDGPU/SIDefines.h @@ -142,6 +142,8 @@ OPERAND_REG_IMM_FP16, OPERAND_REG_IMM_V2FP16, OPERAND_REG_IMM_V2INT16, + OPERAND_REG_IMM_V2INT32, + OPERAND_REG_IMM_V2FP32, /// Operands with register or inline constant OPERAND_REG_INLINE_C_INT16, @@ -150,25 +152,30 @@ OPERAND_REG_INLINE_C_FP16, OPERAND_REG_INLINE_C_FP32, OPERAND_REG_INLINE_C_FP64, - OPERAND_REG_INLINE_C_V2FP16, OPERAND_REG_INLINE_C_V2INT16, + OPERAND_REG_INLINE_C_V2FP16, + OPERAND_REG_INLINE_C_V2INT32, + OPERAND_REG_INLINE_C_V2FP32, /// Operands with an AccVGPR register or inline constant OPERAND_REG_INLINE_AC_INT16, OPERAND_REG_INLINE_AC_INT32, OPERAND_REG_INLINE_AC_FP16, OPERAND_REG_INLINE_AC_FP32, - OPERAND_REG_INLINE_AC_V2FP16, + OPERAND_REG_INLINE_AC_FP64, OPERAND_REG_INLINE_AC_V2INT16, + OPERAND_REG_INLINE_AC_V2FP16, + OPERAND_REG_INLINE_AC_V2INT32, + OPERAND_REG_INLINE_AC_V2FP32, OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32, - OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2INT16, + OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_V2FP32, OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16, - OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2INT16, + OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_AC_V2FP32, OPERAND_REG_INLINE_AC_FIRST = OPERAND_REG_INLINE_AC_INT16, - OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2INT16, + OPERAND_REG_INLINE_AC_LAST = OPERAND_REG_INLINE_AC_V2FP32, OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32, OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, @@ -680,6 +687,8 @@ BCAST31 = 0x143, DPP_UNUSED8_FIRST = 0x144, DPP_UNUSED8_LAST = 0x14F, + ROW_NEWBCAST_FIRST= 0x150, + ROW_NEWBCAST_LAST = 0x15F, ROW_SHARE_FIRST = 0x150, ROW_SHARE_LAST = 0x15F, ROW_XMASK_FIRST = 0x160, Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- 
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -97,6 +97,9 @@ std::pair isOMod(const MachineInstr &MI) const; bool tryFoldOMod(MachineInstr &MI); + bool tryFoldRegSeqence(MachineInstr &MI); + bool tryFoldLCSSAPhi(MachineInstr &MI); + bool tryFoldLoad(MachineInstr &MI); public: SIFoldOperands() : MachineFunctionPass(ID) { @@ -135,6 +138,8 @@ return AMDGPU::V_FMA_F16_gfx9_e64; case AMDGPU::V_FMAC_LEGACY_F32_e64: return AMDGPU::V_FMA_LEGACY_F32_e64; + case AMDGPU::V_FMAC_F64_e64: + return AMDGPU::V_FMA_F64_e64; } return AMDGPU::INSTRUCTION_LIST_END; } @@ -531,8 +536,10 @@ return false; uint8_t OpTy = OpInfo[UseOpIdx].OperandType; - if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST || - OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) + if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST || + OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) && + (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST || + OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST)) return false; if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) && @@ -554,6 +561,19 @@ return false; MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo(); + + // Maybe it is just a COPY of an immediate itself. 
+ MachineInstr *Def = MRI.getUniqueVRegDef(UseReg); + MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) { + MachineOperand &DefOp = Def->getOperand(1); + if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) && + TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) { + UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm()); + return true; + } + } + SmallVector, 32> Defs; if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI)) return false; @@ -825,6 +845,10 @@ else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg())) UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64)); + else if (ST->hasGFX90AInsts() && + TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg())) + UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32)); return; } @@ -1502,6 +1526,194 @@ return true; } +// Try to fold a reg_sequence with vgpr output and agpr inputs into an +// instruction which can take an agpr. So far that means a store. 
+bool SIFoldOperands::tryFoldRegSeqence(MachineInstr &MI) {
+  assert(MI.isRegSequence());
+  auto Reg = MI.getOperand(0).getReg();
+
+  if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
+      !MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER, TII, *MRI))
+    return false;
+
+  for (auto &Def : Defs) {
+    const auto *Op = Def.first;
+    if (!Op->isReg())
+      return false;
+    if (TRI->isAGPR(*MRI, Op->getReg()))
+      continue;
+    // Otherwise the input must be a full-register COPY from an AGPR.
+    const MachineInstr *SubDef = MRI->getUniqueVRegDef(Op->getReg());
+    if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
+      return false;
+    if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
+      return false;
+  }
+
+  MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
+  MachineInstr *UseMI = Op->getParent();
+  while (UseMI->isCopy() && !Op->getSubReg()) {
+    Reg = UseMI->getOperand(0).getReg();
+    if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
+      return false;
+    Op = &*MRI->use_nodbg_begin(Reg);
+    UseMI = Op->getParent();
+  }
+
+  unsigned OpIdx = Op - &UseMI->getOperand(0);
+  const MCInstrDesc &InstDesc = UseMI->getDesc();
+  // Variadic and implicit operands have no MCOperandInfo entry to inspect.
+  if (Op->getSubReg() || OpIdx >= InstDesc.getNumOperands())
+    return false;
+
+  switch (InstDesc.OpInfo[OpIdx].RegClass) {
+  case AMDGPU::AV_32RegClassID: LLVM_FALLTHROUGH;
+  case AMDGPU::AV_64RegClassID: LLVM_FALLTHROUGH;
+  case AMDGPU::AV_96RegClassID: LLVM_FALLTHROUGH;
+  case AMDGPU::AV_128RegClassID: LLVM_FALLTHROUGH;
+  case AMDGPU::AV_160RegClassID:
+    break;
+  default:
+    return false;
+  }
+
+  const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
+  auto Dst = MRI->createVirtualRegister(NewDstRC);
+  auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                    TII->get(AMDGPU::REG_SEQUENCE), Dst);
+
+  for (unsigned I = 0; I < Defs.size(); ++I) {
+    MachineOperand *Def = Defs[I].first;
+    Def->setIsKill(false);
+    if (TRI->isAGPR(*MRI, Def->getReg())) {
+      RS.add(*Def);
+    } else { // A COPY from an AGPR (checked above): use its source directly.
+      MachineInstr *SubDef = MRI->getUniqueVRegDef(Def->getReg());
+      SubDef->getOperand(1).setIsKill(false);
+      RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
+    }
+    RS.addImm(Defs[I].second);
+  }
+
+  Op->setReg(Dst);
+  if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
+    Op->setReg(Reg);
+    RS->eraseFromParent();
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI << '\n');
+
+  return true;
+}
+
+// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
+// This should allow folding of an AGPR into a consumer which may support it.
+// I.e.:
+//
+// loop:                             // loop:
+//   %1:vreg = COPY %0:areg          // exit:
+// exit:                          => //   %1:areg = PHI %0:areg, %loop
+//   %2:vreg = PHI %1:vreg, %loop    //   %2:vreg = COPY %1:areg
+bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
+  assert(PHI.isPHI());
+
+  if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
+    return false;
+
+  Register PhiIn = PHI.getOperand(1).getReg();
+  Register PhiOut = PHI.getOperand(0).getReg();
+  if (PHI.getOperand(1).getSubReg() ||
+      !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
+    return false;
+
+  // A single use should not matter for correctness, but if it has another use
+  // inside the loop we may perform copy twice in a worst case.
+  if (!MRI->hasOneNonDBGUse(PhiIn))
+    return false;
+
+  MachineInstr *Copy = MRI->getUniqueVRegDef(PhiIn);
+  if (!Copy || !Copy->isCopy())
+    return false;
+
+  Register CopyIn = Copy->getOperand(1).getReg();
+  if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
+    return false;
+
+  const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
+  Register NewReg = MRI->createVirtualRegister(ARC);
+  PHI.getOperand(1).setReg(CopyIn);
+  PHI.getOperand(0).setReg(NewReg);
+
+  MachineBasicBlock *MBB = PHI.getParent();
+  BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
+          TII->get(AMDGPU::COPY), PhiOut)
+    .addReg(NewReg, RegState::Kill);
+  Copy->eraseFromParent(); // We know this copy had a single use.
+
+  LLVM_DEBUG(dbgs() << "Folded " << PHI << '\n');
+
+  return true;
+}
+
+// Attempt to convert VGPR load to an AGPR load.
+bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
+  assert(MI.mayLoad());
+  if (!ST->hasGFX90AInsts() || !MI.getNumOperands())
+    return false;
+
+  MachineOperand &Def = MI.getOperand(0);
+  if (!Def.isReg() || !Def.isDef())
+    return false;
+
+  Register DefReg = Def.getReg();
+
+  if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
+    return false;
+
+  SmallVector<const MachineInstr *, 8> Users;
+  SmallVector<Register, 8> MoveRegs;
+  for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg)) {
+    Users.push_back(&I);
+  }
+  if (Users.empty())
+    return false;
+
+  // Check that every use is a copy to an agpr or a reg_sequence producing one.
+  while (!Users.empty()) {
+    const MachineInstr *I = Users.pop_back_val();
+    if (!I->isCopy() && !I->isRegSequence())
+      return false;
+    Register DstReg = I->getOperand(0).getReg();
+    if (TRI->isAGPR(*MRI, DstReg))
+      continue;
+    MoveRegs.push_back(DstReg);
+    for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg)) {
+      Users.push_back(&U);
+    }
+  }
+
+  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+  MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
+  if (!TII->isOperandLegal(MI, 0, &Def)) {
+    MRI->setRegClass(DefReg, RC);
+    return false;
+  }
+
+  while (!MoveRegs.empty()) {
+    Register Reg = MoveRegs.pop_back_val();
+    MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
+  }
+
+  LLVM_DEBUG(dbgs() << "Folded " << MI << '\n');
+
+  return true;
+}
+
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -1529,6 +1741,15 @@

       tryFoldInst(TII, &MI);

+      if (MI.isRegSequence() && tryFoldRegSeqence(MI))
+        continue;
+
+      if (MI.isPHI() && tryFoldLCSSAPhi(MI))
+        continue;
+
+      if (MI.mayLoad() && tryFoldLoad(MI))
+        continue;
+
       if (!TII->isFoldableCopy(MI)) {
         // Saw an unknown clobber of m0, so we no longer know what it is.
if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI)) Index: llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -188,11 +188,24 @@ return MaskA.getHighestLane() > MaskB.getHighestLane(); }); + MCRegister RepReg; + for (MCRegister R : *MRI->getRegClass(Reg)) { + if (!MRI->isReserved(R)) { + RepReg = R; + break; + } + } + if (!RepReg) + llvm_unreachable("Failed to find required allocatable register"); + for (unsigned Idx : CoveringSubregs) { LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none()) continue; + if (MRI->isReserved(TRI->getSubReg(RepReg, Idx))) + continue; + Func(Idx); LaneMask &= ~SubRegMask; if (LaneMask.none()) @@ -261,7 +274,7 @@ // tracking does not account for the alignment requirements for SGPRs, or the // fragmentation of registers the allocator will need to satisfy. 
if (Occupancy >= MFI->getMinAllowedOccupancy() && - MaxPressure.getVGPRNum() <= MaxVGPRs / 2 && + MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 && MaxPressure.getSGPRNum() <= MaxSGPRs / 2) { LastRecordedOccupancy = Occupancy; return true; Index: llvm/lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -138,6 +138,7 @@ .addImm(0) // glc .addImm(0) // slc .addImm(0) // dlc + .addImm(0) // scc .addMemOperand(MMO); return; } @@ -152,6 +153,7 @@ .addImm(0) // tfe .addImm(0) // dlc .addImm(0) // swz + .addImm(0) // scc .addMemOperand(MMO); return; } @@ -181,6 +183,7 @@ .addImm(0) // glc .addImm(0) // slc .addImm(0) // dlc + .addImm(0) // scc .addMemOperand(MMO); if (!HasOffsetReg) { @@ -207,6 +210,7 @@ .addImm(0) // tfe .addImm(0) // dlc .addImm(0) // swz + .addImm(0) // scc .addMemOperand(MMO); } else { // No free register, use stack pointer and restore afterwards. @@ -224,6 +228,7 @@ .addImm(0) // tfe .addImm(0) // dlc .addImm(0) // swz + .addImm(0) // scc .addMemOperand(MMO); BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SUB_U32), SPReg) @@ -257,6 +262,7 @@ .addImm(0) // glc .addImm(0) // slc .addImm(0) // dlc + .addImm(0) // scc .addMemOperand(MMO); return; } @@ -275,6 +281,7 @@ .addImm(0) // glc .addImm(0) // slc .addImm(0) // dlc + .addImm(0) // scc .addMemOperand(MMO); return; } @@ -290,6 +297,7 @@ .addImm(0) // tfe .addImm(0) // dlc .addImm(0) // swz + .addImm(0) // scc .addMemOperand(MMO); return; } @@ -313,6 +321,7 @@ .addImm(0) // tfe .addImm(0) // dlc .addImm(0) // swz + .addImm(0) // scc .addMemOperand(MMO); } @@ -1311,7 +1320,13 @@ const SIRegisterInfo *TRI = ST.getRegisterInfo(); // Ignore the SGPRs the default implementation found. 
- SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); + SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask()); + + // Do not save AGPRs prior to GFX90A because there was no easy way to do so. + // In gfx908 there were no AGPR loads and stores and thus spilling also + // requires a temporary VGPR. + if (!ST.hasGFX90AInsts()) + SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); // hasFP only knows about stack objects that already exist. We're now // determining the stack slots that will be created, so we have to predict @@ -1366,7 +1381,7 @@ SavedRegs.reset(MFI->getStackPtrOffsetReg()); const BitVector AllSavedRegs = SavedRegs; - SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask()); + SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask()); // If clearing VGPRs changed the mask, we will have some CSR VGPR spills. const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -718,6 +718,19 @@ setOperationAction(ISD::FEXP, MVT::v2f16, Custom); setOperationAction(ISD::SELECT, MVT::v4i16, Custom); setOperationAction(ISD::SELECT, MVT::v4f16, Custom); + + if (Subtarget->hasPackedFP32Ops()) { + setOperationAction(ISD::FADD, MVT::v2f32, Legal); + setOperationAction(ISD::FMUL, MVT::v2f32, Legal); + setOperationAction(ISD::FMA, MVT::v2f32, Legal); + setOperationAction(ISD::FNEG, MVT::v2f32, Legal); + + for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) { + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + } + } } setOperationAction(ISD::FNEG, MVT::v4f16, Custom); @@ -1128,17 +1141,6 @@ MachineMemOperand::MOVolatile; return true; } - case Intrinsic::amdgcn_global_atomic_fadd: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = 
CI.getOperand(0); - Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile; - return true; - } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SIMachineFunctionInfo *MFI = MF.getInfo(); Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1150,6 +1152,22 @@ MachineMemOperand::MODereferenceable; return true; } + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1191,6 +1209,9 @@ case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); @@ -1799,23 +1820,37 @@ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); - Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); + unsigned Mask = (Subtarget->hasPackedTID() && + Info.hasWorkItemIDY()) ? 
0x3ff : ~0u; + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); } if (Info.hasWorkItemIDY()) { - Register Reg = AMDGPU::VGPR1; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + assert(Info.hasWorkItemIDX()); + if (Subtarget->hasPackedTID()) { + Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0, + 0x3ff << 10)); + } else { + unsigned Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + } } if (Info.hasWorkItemIDZ()) { - Register Reg = AMDGPU::VGPR2; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY()); + if (Subtarget->hasPackedTID()) { + Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0, + 0x3ff << 20)); + } else { + unsigned Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + } } } @@ -4380,7 +4415,8 @@ SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4401,7 +4437,8 @@ SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -6168,6 +6205,8 @@ if 
(IsGFX10Plus) Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); Ops.push_back(Unorm); + if (!IsGFX10Plus) + Ops.push_back(DAG.getTargetConstant(0, SDLoc(), MVT::i1)); if (IsGFX10Plus) Ops.push_back(DLC); Ops.push_back(GLC); @@ -6176,8 +6215,12 @@ ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); if (IsGFX10Plus) Ops.push_back(IsA16 ? True : False); - Ops.push_back(TFE); - Ops.push_back(LWE); + if (!Subtarget->hasGFX90AInsts()) { + Ops.push_back(TFE); //tfe + } else if (cast(TFE)->getZExtValue()) { + report_fatal_error("TFE is not supported on this GPU"); + } + Ops.push_back(LWE); // lwe if (!IsGFX10Plus) Ops.push_back(DimInfo->DA ? True : False); if (BaseOpcode->HasD16) @@ -6195,7 +6238,15 @@ : AMDGPU::MIMGEncGfx10Default, NumVDataDwords, NumVAddrDwords); } else { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (Subtarget->hasGFX90AInsts()) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + report_fatal_error( + "requested image instruction is not supported on this GPU"); + } + if (Opcode == -1 && + Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, NumVDataDwords, NumVAddrDwords); if (Opcode == -1) @@ -7062,7 +7113,7 @@ Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; case Intrinsic::amdgcn_buffer_atomic_fadd: - if (!Op.getValue(0).use_empty()) { + if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { DiagnosticInfoUnsupported NoFpRet(DAG.getMachineFunction().getFunction(), "return versions of fp atomics not supported", @@ -7083,6 +7134,14 @@ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + return lowerRawBufferAtomicIntrin(Op, DAG, 
AMDGPUISD::BUFFER_ATOMIC_FMIN); + case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); + case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); + case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); case Intrinsic::amdgcn_raw_buffer_atomic_swap: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_raw_buffer_atomic_add: @@ -7208,27 +7267,6 @@ return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } - case Intrinsic::amdgcn_global_atomic_fadd: { - if (!Op.getValue(0).use_empty()) { - DiagnosticInfoUnsupported - NoFpRet(DAG.getMachineFunction().getFunction(), - "return versions of fp atomics not supported", - DL.getDebugLoc(), DS_Error); - DAG.getContext()->diagnose(NoFpRet); - return SDValue(); - } - MemSDNode *M = cast(Op); - SDValue Ops[] = { - M->getOperand(0), // Chain - M->getOperand(2), // Ptr - M->getOperand(3) // Value - }; - - EVT VT = Op.getOperand(3).getValueType(); - return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, - DAG.getVTList(VT, MVT::Other), Ops, - M->getMemOperand()); - } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SDLoc DL(Op); MemSDNode *M = cast(Op); @@ -7299,7 +7337,55 @@ DAG.setNodeMemRefs(NewNode, {MemRef}); return SDValue(NewNode, 0); } + case Intrinsic::amdgcn_global_atomic_fadd: + if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } + LLVM_FALLTHROUGH; + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case 
Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: { + MemSDNode *M = cast(Op); + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + unsigned Opcode = 0; + switch (IntrID) { + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fadd: { + EVT VT = Op.getOperand(3).getValueType(); + return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, + DAG.getVTList(VT, MVT::Other), Ops, + M->getMemOperand()); + } + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmin: { + Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; + break; + } + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fmax: { + Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX; + break; + } + default: + llvm_unreachable("unhandled atomic opcode"); + } + return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op), + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) return lowerImage(Op, ImageDimIntr, DAG, true); @@ -10813,7 +10899,7 @@ unsigned NewDmask = 0; unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; - bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) || + bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || Node->getConstantOperandVal(LWEIdx)) ? 1 : 0; unsigned TFCLane = 0; bool HasChain = Node->getNumValues() > 1; @@ -11768,26 +11854,39 @@ if (Ty->isHalfTy()) return AtomicExpansionKind::None; - if (!Ty->isFloatTy()) + if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy())) return AtomicExpansionKind::CmpXChg; // TODO: Do have these for flat. Older targets also had them for buffers. 
unsigned AS = RMW->getPointerAddressSpace(); - if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) { + if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) && + Subtarget->hasAtomicFaddInsts()) { if (!fpModeMatchesGlobalFPAtomicMode(RMW) || RMW->getFunction()->getFnAttribute("amdgpu-unsafe-fp-atomics") .getValueAsString() != "true") return AtomicExpansionKind::CmpXChg; + if (Subtarget->hasGFX90AInsts()) + return (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) ? + AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; + + if (!Subtarget->hasGFX90AInsts() && AS != AMDGPUAS::GLOBAL_ADDRESS) + return AtomicExpansionKind::CmpXChg; + return RMW->use_empty() ? AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg; } // DS FP atomics do repect the denormal mode, but the rounding mode is fixed // to round-to-nearest-even. - return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ? - AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg; + // The only exception is DS_ADD_F64 which never flushes regardless of mode. + if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) { + return (Ty->isDoubleTy() && !fpModeMatchesGlobalFPAtomicMode(RMW)) ? + AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; + } + + return AtomicExpansionKind::CmpXChg; } default: break; Index: llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -131,7 +131,8 @@ // We reserve a fixed number of VGPR slots in the scoring tables for // special tokens like SCMEM_LDS (needed for buffer load to LDS). enum RegisterMapping { - SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets. + SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. + AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets. 
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses. @@ -451,8 +452,7 @@ const SIRegisterInfo *TRI, unsigned OpNo) const { const MachineOperand &Op = MI->getOperand(OpNo); - assert(Op.isReg()); - if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg())) + if (!TRI->isInAllocatableClass(Op.getReg())) return {-1, -1}; // A use via a PW operand does not need a waitcnt. @@ -463,9 +463,11 @@ unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)); - if (TRI->isVGPR(*MRI, Op.getReg())) { + if (TRI->isVectorRegister(*MRI, Op.getReg())) { assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); Result.first = Reg - RegisterEncoding.VGPR0; + if (TRI->isAGPR(*MRI, Op.getReg())) + Result.first += AGPR_OFFSET; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); @@ -491,7 +493,7 @@ const MachineRegisterInfo *MRI, unsigned OpNo, unsigned Val) { RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo); - assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg())); + assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { setRegScore(RegNo, EXP_CNT, Val); } @@ -549,7 +551,8 @@ Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { const MachineOperand &Op = Inst.getOperand(I); - if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) { + if (Op.isReg() && !Op.isDef() && + TRI->isVectorRegister(*MRI, Op.getReg())) { setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); } } @@ -606,7 +609,8 @@ } for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) { MachineOperand &MO = Inst.getOperand(I); - if (MO.isReg() && !MO.isDef() 
&& TRI->isVGPR(*MRI, MO.getReg())) { + if (MO.isReg() && !MO.isDef() && + TRI->isVectorRegister(*MRI, MO.getReg())) { setExpScore(&Inst, TII, TRI, MRI, I, CurrScore); } } @@ -1003,7 +1007,7 @@ RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I); - const bool IsVGPR = TRI->isVGPR(*MRI, Op.getReg()); + const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (IsVGPR) { // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the @@ -1208,6 +1212,10 @@ if (!TII->usesLGKM_CNT(MI)) return false; + // If in tgsplit mode then there can be no use of LDS. + if (ST->isTgSplitEnabled()) + return false; + // If there are no memory operands then conservatively assume the flat // operation may access LDS. if (MI.memoperands_empty()) Index: llvm/lib/Target/AMDGPU/SIInstrFormats.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -278,7 +278,7 @@ } class MIMGe : Enc64 { - bits<8> vdata; + bits<10> vdata; bits<4> dmask; bits<1> unorm; bits<1> glc; @@ -294,11 +294,10 @@ let Inst{12} = unorm; let Inst{13} = glc; let Inst{15} = r128; - let Inst{16} = tfe; let Inst{17} = lwe; let Inst{25} = slc; let Inst{31-26} = 0x3c; - let Inst{47-40} = vdata; + let Inst{47-40} = vdata{7-0}; let Inst{52-48} = srsrc{6-2}; let Inst{57-53} = ssamp{6-2}; let Inst{63} = d16; @@ -307,9 +306,25 @@ class MIMGe_gfx6789 op> : MIMGe { bits<8> vaddr; bits<1> da; + bits<1> sccb; + + let Inst{0} = op{7}; + let Inst{7} = sccb; + let Inst{14} = da; + let Inst{16} = tfe; + let Inst{24-18} = op{6-0}; + let Inst{39-32} = vaddr; +} + +class MIMGe_gfx90a op> : MIMGe { + bits<8> vaddr; + bits<1> da; + bits<1> sccb; let Inst{0} = op{7}; + let Inst{7} = sccb; let Inst{14} = da; + let Inst{16} = vdata{9}; // ACC bit let Inst{24-18} = op{6-0}; let Inst{39-32} = vaddr; } @@ -325,6 +340,7 @@ let Inst{2-1} = nsa; 
let Inst{5-3} = dim; let Inst{7} = dlc; + let Inst{16} = tfe; let Inst{24-18} = op{6-0}; let Inst{39-32} = vaddr0; let Inst{62} = a16; Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -171,6 +171,10 @@ return RI; } + const GCNSubtarget &getSubtarget() const { + return ST; + } + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA) const override; @@ -1085,11 +1089,7 @@ const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) - const override { - if (OpNum >= TID.getNumOperands()) - return nullptr; - return RI.getRegClass(TID.OpInfo[OpNum].RegClass); - } + const override; void fixImplicitOperands(MachineInstr &MI) const; Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -576,15 +576,18 @@ if (!Tmp) report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); RS.setRegUsed(Tmp); - // Only loop through if there are any free registers left, otherwise - // scavenger may report a fatal error without emergency spill slot - // or spill with the slot. - while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { - Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); - if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) - break; - Tmp = Tmp2; - RS.setRegUsed(Tmp); + + if (!TII.getSubtarget().hasGFX90AInsts()) { + // Only loop through if there are any free registers left, otherwise + // scavenger may report a fatal error without emergency spill slot + // or spill with the slot. 
+ while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { + Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) + break; + Tmp = Tmp2; + RS.setRegUsed(Tmp); + } } // Insert copy to temporary VGPR. @@ -782,7 +785,6 @@ return; } - if (RC == &AMDGPU::AGPR_32RegClass) { if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) @@ -790,6 +792,12 @@ return; } + if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + // FIXME: Pass should maintain scavenger to avoid scan through the block on // every AGPR spill. RegScavenger RS; @@ -797,7 +805,8 @@ return; } - if (RI.getRegSizeInBits(*RC) == 16) { + const unsigned Size = RI.getRegSizeInBits(*RC); + if (Size == 16) { assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || AMDGPU::SReg_LO16RegClass.contains(SrcReg) || @@ -863,6 +872,24 @@ return; } + if (RC == &AMDGPU::VReg_64RegClass && + !RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { + if (ST.hasPackedFP32Ops()) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) + .addImm(SISrcMods::OP_SEL_1) + .addReg(SrcReg) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) + .addReg(SrcReg) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0) // clamp + .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); + return; + } + } + const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); if (RI.isSGPRClass(RC)) { if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) { @@ -873,12 +900,20 @@ return; } + unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.hasAGPRs(RC)) { - Opcode = RI.hasVGPRs(RI.getPhysRegClass(SrcReg)) ? + Opcode = (RI.hasVGPRs(RI.getPhysRegClass(SrcReg))) ? 
AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; + } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && + !RI.hasAGPRs(RI.getPhysRegClass(SrcReg))) { + // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. + if (ST.hasPackedFP32Ops()) { + Opcode = AMDGPU::V_PK_MOV_B32; + EltSize = 8; + } } // For the cases where we need an intermediate instruction/temporary register @@ -890,7 +925,7 @@ if (Opcode == AMDGPU::INSTRUCTION_LIST_END) RS.reset(new RegScavenger()); - ArrayRef SubIndices = RI.getRegSplitParts(RC, 4); + ArrayRef SubIndices = RI.getRegSplitParts(RC, EltSize); // If there is an overlap, we can't kill the super-register on the last // instruction, since it will also kill the components made live by this def. @@ -911,6 +946,23 @@ indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, ImpDefSuper, ImpUseSuper); + } else if (Opcode == AMDGPU::V_PK_MOV_B32) { + Register DstSubReg = RI.getSubReg(DestReg, SubIdx); + Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + MachineInstrBuilder MIB = + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) + .addImm(SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0) // clamp + .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + if (Idx == 0) + MIB.addReg(DestReg, RegState::Define | RegState::Implicit); } else { MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) @@ -1663,20 +1715,49 @@ assert(!SrcOp.isFPImm()); if (SrcOp.isImm()) { APInt Imm(64, SrcOp.getImm()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addImm(Imm.getLoBits(32).getZExtValue()) - .addReg(Dst, 
RegState::Implicit | RegState::Define); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addImm(Imm.getHiBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit | RegState::Define); + APInt Lo(32, Imm.getLoBits(32).getZExtValue()); + APInt Hi(32, Imm.getHiBits(32).getZExtValue()); + if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) + .addImm(SISrcMods::OP_SEL_1) + .addImm(Lo.getSExtValue()) + .addImm(SISrcMods::OP_SEL_1) + .addImm(Lo.getSExtValue()) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp + } else { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addImm(Lo.getZExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + .addImm(Hi.getZExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); + } } else { assert(SrcOp.isReg()); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) - .addReg(Dst, RegState::Implicit | RegState::Define); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) - .addReg(Dst, RegState::Implicit | RegState::Define); + if (ST.hasPackedFP32Ops() && + !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) + .addImm(SISrcMods::OP_SEL_1) // src0_mod + .addReg(SrcOp.getReg()) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod + .addReg(SrcOp.getReg()) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0); // clamp + } else { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit | RegState::Define); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) + 
.addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit | RegState::Define); + } } MI.eraseFromParent(); break; @@ -1890,7 +1971,6 @@ unsigned Part = 0; MachineInstr *Split[2]; - for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); if (Dst.isPhysical()) { @@ -2609,6 +2689,7 @@ case AMDGPU::COPY: case AMDGPU::V_ACCVGPR_WRITE_B32_e64: case AMDGPU::V_ACCVGPR_READ_B32_e64: + case AMDGPU::V_ACCVGPR_MOV_B32: return true; default: return false; @@ -2999,7 +3080,9 @@ unsigned Opc = MI.getOpcode(); bool IsF16 = false; bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || - Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64; + Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; + bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; switch (Opc) { default: @@ -3010,13 +3093,15 @@ LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMAC_F64_e64: break; case AMDGPU::V_MAC_F16_e32: case AMDGPU::V_FMAC_F16_e32: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e32: - case AMDGPU::V_FMAC_F32_e32: { + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_F64_e32: { int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); const MachineOperand *Src0 = &MI.getOperand(Src0Idx); @@ -3042,7 +3127,7 @@ const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); MachineInstrBuilder MIB; - if (!Src0Mods && !Src1Mods && !Clamp && !Omod && + if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 && // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { @@ -3090,7 +3175,9 @@ } } - unsigned NewOpc = IsFMA ? (IsF16 ? 
AMDGPU::V_FMA_F16_e64 : AMDGPU::V_FMA_F32_e64) + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 + : IsF64 ? AMDGPU::V_FMA_F64_e64 + : AMDGPU::V_FMA_F32_e64) : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); if (pseudoToMCOpcode(NewOpc) == -1) return nullptr; @@ -3278,6 +3365,10 @@ case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: { int32_t Trunc = static_cast(Imm); @@ -3287,6 +3378,7 @@ case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: return AMDGPU::isInlinableLiteral64(MO.getImm(), ST.hasInv2PiInlineImm()); case AMDGPU::OPERAND_REG_IMM_INT16: @@ -3398,6 +3490,10 @@ } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { + // GFX90A does not have V_MUL_LEGACY_F32_e32. 
+ if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts()) + return false; + int Op32 = AMDGPU::getVOPe32(Opcode); if (Op32 == -1) return false; @@ -3455,6 +3551,7 @@ case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F32_e64: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F64_e64: if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; @@ -3706,7 +3803,8 @@ case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { ErrInfo = "Illegal immediate value for operand."; @@ -4203,12 +4301,68 @@ } if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST && ST.getGeneration() < AMDGPUSubtarget::GFX10) { + if (DC >= DppCtrl::ROW_NEWBCAST_FIRST && + DC <= DppCtrl::ROW_NEWBCAST_LAST && + !ST.hasGFX90AInsts()) { + ErrInfo = "Invalid dpp_ctrl value: " + "row_newbroadcast/row_share is not supported before " + "GFX90A/GFX10"; + return false; + } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) { + ErrInfo = "Invalid dpp_ctrl value: " + "row_share and row_xmask are not supported before GFX10"; + return false; + } + } + + int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); + int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); + + if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && + ((DstIdx >= 0 && + Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID) || + ((Src0Idx >= 0 && + Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID))) && + !AMDGPU::isLegal64BitDPPControl(DC)) { ErrInfo = "Invalid dpp_ctrl value: " - "row_share and row_xmask are not supported before GFX10"; + "64 bit dpp only support row_newbcast"; return false; } } 
+ if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) { + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); + uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata; + const MachineOperand *Data = getNamedOperand(MI, DataNameIdx); + const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1); + if (Data && !Data->isReg()) + Data = nullptr; + + if (ST.hasGFX90AInsts()) { + if (Dst && Data && + (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) { + ErrInfo = "Invalid register class: " + "vdata and vdst should be both VGPR or AGPR"; + return false; + } + if (Data && Data2 && + (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) { + ErrInfo = "Invalid register class: " + "both data operands should be VGPR or AGPR"; + return false; + } + } else { + if ((Dst && RI.isAGPR(MRI, Dst->getReg())) || + (Data && RI.isAGPR(MRI, Data->getReg())) || + (Data2 && RI.isAGPR(MRI, Data2->getReg()))) { + ErrInfo = "Invalid register class: " + "agpr loads and stores not supported on this GPU"; + return false; + } + } + } + return true; } @@ -4292,6 +4446,59 @@ "Unexpected scalar opcode without corresponding vector one!"); } +static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, + const MachineRegisterInfo &MRI, + const MCInstrDesc &TID, + unsigned RCID, + bool IsAllocatable) { + if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && + (TID.mayLoad() || TID.mayStore() || + (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { + switch (RCID) { + case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; + case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; + case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID; + case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID; + case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID; + default: + break; + } + } + return RCID; +} + +const TargetRegisterClass 
*SIInstrInfo::getRegClass(const MCInstrDesc &TID, + unsigned OpNum, const TargetRegisterInfo *TRI, + const MachineFunction &MF) + const { + if (OpNum >= TID.getNumOperands()) + return nullptr; + auto RegClass = TID.OpInfo[OpNum].RegClass; + bool IsAllocatable = false; + if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) { + // vdst and vdata should be both VGPR or AGPR, same for the DS instructions + // with two data operands. Request register class constrained to VGPR only + // if both operands present as Machine Copy Propagation can not check this + // constraint and possibly other passes too. + // + // The check is limited to FLAT and DS because atomics in non-flat encoding + // have their vdst and vdata tied to be the same register. + const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, + AMDGPU::OpName::vdst); + const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode, + (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0 + : AMDGPU::OpName::vdata); + if (DataIdx != -1) { + IsAllocatable = VDstIdx != -1 || + AMDGPU::getNamedOperandIdx(TID.Opcode, + AMDGPU::OpName::data1) != -1; + } + } + RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass, + IsAllocatable); + return RI.getRegClass(RegClass); +} + const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -4306,6 +4513,7 @@ } unsigned RCID = Desc.OpInfo[OpNo].RegClass; + RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true); return RI.getRegClass(RCID); } @@ -4482,7 +4690,40 @@ if (MO->isReg()) { assert(DefinedRC); - return isLegalRegOperand(MRI, OpInfo, *MO); + if (!isLegalRegOperand(MRI, OpInfo, *MO)) + return false; + bool IsAGPR = RI.isAGPR(MRI, MO->getReg()); + if (IsAGPR && !ST.hasMAIInsts()) + return false; + unsigned Opc = MI.getOpcode(); + if (IsAGPR && + (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && + (MI.mayLoad() ||
MI.mayStore() || isDS(Opc) || isMIMG(Opc))) + return false; + // Atomics should have both vdst and vdata either vgpr or agpr. + const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + const int DataIdx = AMDGPU::getNamedOperandIdx(Opc, + isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata); + if ((int)OpIdx == VDstIdx && DataIdx != -1 && + MI.getOperand(DataIdx).isReg() && + RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR) + return false; + if ((int)OpIdx == DataIdx) { + if (VDstIdx != -1 && + RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR) + return false; + // DS instructions with 2 src operands also must have tied RC. + const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::data1); + if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() && + RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR) + return false; + } + if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) && + RI.isSGPRReg(MRI, MO->getReg())) + return false; + return true; } // Handle non-register types that are treated like immediates. 
@@ -5341,6 +5582,10 @@ getNamedOperand(MI, AMDGPU::OpName::dlc)) { MIB.addImm(DLC->getImm()); } + if (const MachineOperand *SCCB = + getNamedOperand(MI, AMDGPU::OpName::sccb)) { + MIB.addImm(SCCB->getImm()); + } MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); @@ -7085,7 +7330,8 @@ GFX80 = 4, GFX9 = 5, GFX10 = 6, - SDWA10 = 7 + SDWA10 = 7, + GFX90A = 8 }; static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -7157,6 +7403,15 @@ if (MCOp == -1) return Opcode; + if (ST.hasGFX90AInsts()) { + uint16_t NMCOp = (uint16_t)-1; + NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A); + if (NMCOp == (uint16_t)-1) + NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9); + if (NMCOp != (uint16_t)-1) + MCOp = NMCOp; + } + // (uint16_t)-1 means that Opcode is a pseudo instruction that has // no encoding in the given subtarget generation. if (MCOp == (uint16_t)-1) Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -28,6 +28,7 @@ int GFX9 = 5; int GFX10 = 6; int SDWA10 = 7; + int GFX90A = 8; } //===----------------------------------------------------------------------===// @@ -186,6 +187,8 @@ def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; +def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; +def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, @@ -265,21 +268,25 @@ !eq(SrcVT.Value, v2f16.Value), !eq(SrcVT.Value, v4f16.Value), !eq(SrcVT.Value, v2f32.Value), - !eq(SrcVT.Value, v2f64.Value)); + !eq(SrcVT.Value, v2f64.Value), + !eq(SrcVT.Value, v4f64.Value)); } class 
isIntType { bit ret = !or(!eq(SrcVT.Value, i16.Value), !eq(SrcVT.Value, i32.Value), - !eq(SrcVT.Value, i64.Value)); + !eq(SrcVT.Value, i64.Value), + !eq(SrcVT.Value, v2i32.Value)); } class isPackedType { bit ret = !or(!eq(SrcVT.Value, v2i16.Value), !eq(SrcVT.Value, v2f16.Value), - !eq(SrcVT.Value, v4f16.Value)); + !eq(SrcVT.Value, v4f16.Value), + !eq(SrcVT.Value, v2f32.Value)); } + //===----------------------------------------------------------------------===// // PatFrags for global memory operations //===----------------------------------------------------------------------===// @@ -822,6 +829,10 @@ return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); }]>; +def extract_sccb : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 4) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // Custom Operands //===----------------------------------------------------------------------===// @@ -1097,6 +1108,9 @@ def clampmod0 : NamedOperandBit_0<"ClampSI", NamedMatchClass<"ClampSI">>; def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>; +def SCCB : NamedOperandBit<"SCCB", NamedMatchClass<"SCCB">>; +def SCCB_0 : NamedOperandBit_0<"SCCB", NamedMatchClass<"SCCB">>; + def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; def DLC_0 : NamedOperandBit_0<"DLC", NamedMatchClass<"DLC">>; @@ -1243,7 +1257,7 @@ def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isVReg32"; + let PredicateMethod = "isVRegWithInputMods"; } def FPVRegInputMods : InputMods { @@ -1270,7 +1284,7 @@ def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isVReg32"; + let PredicateMethod = "isVRegWithInputMods"; } def IntVRegInputMods : InputMods { @@ -1507,8 +1521,12 @@ VSrc_128, !if(!eq(VT.Size, 64), 
!if(isFP, - VSrc_f64, - VSrc_b64), + !if(!eq(VT.Value, v2f32.Value), + VSrc_v2f32, + VSrc_f64), + !if(!eq(VT.Value, v2i32.Value), + VSrc_v2b32, + VSrc_b64)), !if(!eq(VT.Value, i1.Value), SSrc_i1, !if(isFP, @@ -1541,7 +1559,9 @@ !eq(SrcVT.Value, f32.Value), !eq(SrcVT.Value, f64.Value), !eq(SrcVT.Value, v2f16.Value), - !eq(SrcVT.Value, v2i16.Value)); + !eq(SrcVT.Value, v2i16.Value), + !eq(SrcVT.Value, v2f32.Value), + !eq(SrcVT.Value, v2i32.Value)); } // Return type of input modifiers operand for specified input operand @@ -1972,14 +1992,29 @@ string ret = dst#args#sdwa; } +class getHas64BitOps { + bit ret = !if(!eq(NumSrcArgs, 3), + 0, + !if(!eq(DstVT.Size, 64), + 1, + !if(!eq(Src0VT.Size, 64), + 1, + !if(!eq(Src1VT.Size, 64), + 1, + 0 + ) + ) + ) + ); +} -// Function that checks if instruction supports DPP and SDWA -class getHasExt { +class getHasSDWA { bit ret = !if(!eq(NumSrcArgs, 3), - 0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3 + 0, // NumSrcArgs == 3 - No SDWA for VOP3 !if(!eq(DstVT.Size, 64), - 0, // 64-bit dst - No DPP or SDWA for 64-bit operands + 0, // 64-bit dst - No SDWA for 64-bit operands !if(!eq(Src0VT.Size, 64), 0, // 64-bit src0 !if(!eq(Src1VT.Size, 64), @@ -1993,8 +2028,42 @@ class getHasDPP { - bit ret = !if(!eq(NumSrcArgs, 0), 0, - getHasExt.ret); + bit ret = !if(!eq(NumSrcArgs, 3), + 0, // NumSrcArgs == 3 - No DPP for VOP3 + 1); +} + +class getHasExt64BitDPP { + bit ret = !and(getHasDPP.ret, + getHas64BitOps.ret); +} + +// Function that checks if instruction supports DPP and SDWA +class getHasExt { + bit ret = !or(getHasDPP.ret, + getHasSDWA.ret); +} + +// Return an AGPR+VGPR operand class for the given VGPR register class. 
+class getLdStRegisterOperand { + RegisterOperand ret = + !if(!eq(RC.Size, 32), AVLdSt_32, + !if(!eq(RC.Size, 64), AVLdSt_64, + !if(!eq(RC.Size, 96), AVLdSt_96, + !if(!eq(RC.Size, 128), AVLdSt_128, + !if(!eq(RC.Size, 160), AVLdSt_160, + RegisterOperand // invalid register + ))))); +} + +class BitOr { + bit ret = !if(a, 1, !if(b, 1, 0)); +} + +class BitAnd { + bit ret = !if(a, !if(b, 1, 0), 0); } def PatGenMode { @@ -2077,8 +2146,9 @@ field bit HasExt = getHasExt.ret; field bit HasExtDPP = getHasDPP.ret; - field bit HasExtSDWA = HasExt; - field bit HasExtSDWA9 = HasExt; + field bit HasExt64BitDPP = getHasExt64BitDPP.ret; + field bit HasExtSDWA = getHasSDWA.ret; + field bit HasExtSDWA9 = HasExtSDWA; field int NeedPatGen = PatGenMode.NoPattern; field bit IsMAI = 0; @@ -2144,6 +2214,7 @@ class VOP_NO_EXT : VOPProfile { let HasExt = 0; let HasExtDPP = 0; + let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; } @@ -2191,6 +2262,7 @@ def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>; def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>; def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>; +def VOP_I64_I64 : VOPProfile <[i64, i64, untyped, untyped]>; def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>; def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>; @@ -2234,6 +2306,16 @@ def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>; def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>; +def VOP_V4F64_F64_F64_V4F64 : VOPProfile <[v4f64, f64, f64, v4f64]>; +def VOP_V1F64_F64_F64_V1F64 : VOPProfile <[v1f64, f64, f64, v1f64]>; + +def VOP_V2F32_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, v2f32]>; +def VOP_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, untyped]>; +def VOP_V2I32_V2I32_V2I32 : VOPProfile <[v2i32, v2i32, v2i32, untyped]>; +def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>; +def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, 
v4i16, v16f32]>; +def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>; + class Commutable_REV { string RevOp = revOp; bit IsOrig = isOrig; @@ -2372,7 +2454,8 @@ [!cast(SIEncodingFamily.GFX80)], [!cast(SIEncodingFamily.GFX9)], [!cast(SIEncodingFamily.GFX10)], - [!cast(SIEncodingFamily.SDWA10)]]; + [!cast(SIEncodingFamily.SDWA10)], + [!cast(SIEncodingFamily.GFX90A)]]; } // Get equivalent SOPK instruction. Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -41,18 +41,21 @@ (i32 timm:$attrchan), (i32 timm:$attr), M0))] >; -let OtherPredicates = [has32BankLDS] in { +let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in { defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; -} // End OtherPredicates = [has32BankLDS] +} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus] -let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in { +let OtherPredicates = [has16BankLDS, isNotGFX90APlus], + Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in { defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; -} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 +} // End OtherPredicates = [has16BankLDS, isNotGFX90APlus], + // Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 +let OtherPredicates = [isNotGFX90APlus] in { let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { defm V_INTERP_P2_F32 : VINTRP_m < @@ -73,6 +76,8 @@ [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc), (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; +} // End OtherPredicates = [isNotGFX90APlus] + } // End Uses = [MODE, M0, EXEC] //===----------------------------------------------------------------------===// @@ -86,11 +91,6 @@ let maybeAtomic = 1; } -def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> { - let HasExt =
1; - let HasExtDPP = 1; -} - let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns @@ -107,7 +107,7 @@ (ins VSrc_b64:$src0)>; // 64-bit vector move with dpp. Expanded post-RA. -def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> { +def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> { let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete. } @@ -1373,6 +1373,19 @@ // sub1) // >; +// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead +// of the real value. +def : GCNPat < + (fneg (v2f32 SReg_64:$src)), + (v2f32 (REG_SEQUENCE SReg_64, + (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub0, + (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), + (i32 (S_MOV_B32 (i32 0x80000000)))), + SReg_32)), sub1)) +>; + } // End let AddedComplexity = 1 def : GCNPat < @@ -1437,6 +1450,15 @@ sub1) >; +def : GCNPat < + (getDivergentFrag.ret (v2f32 VReg_64:$src)), + (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, VReg_64:$src, + 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, 0, + 0, 0, 0, 0, 0) +> { + let SubtargetPredicate = HasPackedFP32Ops; +} + def : GCNPat < (fcopysign f16:$src0, f16:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) @@ -1556,9 +1578,16 @@ /********** Intrinsic Patterns **********/ /********** ================== **********/ +let OtherPredicates = [isNotGFX90APlus] in // FIXME: Should use _e64 and select source modifiers.
def : POW_Common ; +let OtherPredicates = [isGFX90APlus] in +def : GCNPat < + (fpow f32:$src0, f32:$src1), + (V_EXP_F32_e32 (V_MUL_LEGACY_F32_e64 0, f32:$src1, SRCMODS.NONE, (V_LOG_F32_e32 f32:$src0), 0, 0)) +>; + def : GCNPat < (i32 (sext i1:$src0)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), @@ -2167,6 +2196,17 @@ SRCMODS.NONE, $src2) >; +let SubtargetPredicate = isGFX90APlus in +def : GCNPat < + (fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)), + (f64 (VOP3NoMods f64:$src2))), + (V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + SRCMODS.NONE, $src2, $clamp, $omod) +>; + +// COPY is workaround tablegen bug from multiple outputs +// from S_LSHL_B32's multiple outputs from implicit scc def. def : GCNPat < (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i16 16)) @@ -2652,6 +2692,8 @@ def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); Index: llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -107,6 +107,7 @@ bool GLC; bool SLC; bool DLC; + bool SCCB; // vmem only. 
bool UseST64; int AddrIdx[MaxAddressRegs]; const MachineOperand *AddrReg[MaxAddressRegs]; @@ -199,6 +200,7 @@ const CombineInfo &Paired); const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, const CombineInfo &Paired); + const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, SmallVectorImpl &InstsToMove); @@ -304,6 +306,16 @@ return 2; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: return 4; + case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; + case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; + case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; + case AMDGPU::DS_WRITE_B32_gfx9: + return 1; + case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; + case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; + case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; + case AMDGPU::DS_WRITE_B64_gfx9: + return 2; default: return 0; } @@ -526,6 +538,9 @@ SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); } DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); + if (InstClass != S_BUFFER_LOAD_IMM) { + SCCB = TII.getNamedOperand(*I, AMDGPU::OpName::sccb)->getImm(); + } } AddressRegs Regs = getRegs(Opc, TII); @@ -784,7 +799,8 @@ return (EltOffset0 + CI.Width == EltOffset1 || EltOffset1 + Paired.Width == EltOffset0) && CI.GLC == Paired.GLC && CI.DLC == Paired.DLC && - (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC); + (CI.InstClass == S_BUFFER_LOAD_IMM || + (CI.SLC == Paired.SLC && CI.SCCB == Paired.SCCB)); } // If the offset in elements doesn't fit in 8-bits, we might be able to use @@ -864,6 +880,26 @@ } } +const TargetRegisterClass * +SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { + if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { + return TRI->getRegClassForReg(*MRI, Dst->getReg()); + } + if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { + return TRI->getRegClassForReg(*MRI, Src->getReg()); + } + if (const auto *Src = 
TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { + return TRI->getRegClassForReg(*MRI, Src->getReg()); + } + if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { + return TRI->getRegClassForReg(*MRI, Dst->getReg()); + } + if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { + return TRI->getRegClassForReg(*MRI, Src->getReg()); + } + return nullptr; +} + /// This function assumes that CI comes before Paired in a basic block. bool SILoadStoreOptimizer::checkAndPrepareMerge( CombineInfo &CI, CombineInfo &Paired, @@ -896,6 +932,9 @@ DenseSet PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); + const TargetRegisterClass *DataRC = getDataRegClass(*CI.I); + bool IsAGPR = TRI->hasAGPRs(DataRC); + MachineBasicBlock::iterator E = std::next(Paired.I); MachineBasicBlock::iterator MBBI = std::next(CI.I); MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); @@ -964,6 +1003,17 @@ continue; if (&*MBBI == &*Paired.I) { + if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR) + return false; + // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data + // operands. However we are reporting that ds_write2 shall have + // only VGPR data so that machine copy propagation does not + // create an illegal instruction with VGPR and AGPR sources. + // Consequently if we create such instruction the verifier + // will complain. + if (IsAGPR && CI.InstClass == DS_WRITE) + return false; + // We need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. @@ -1037,8 +1087,7 @@ const MCInstrDesc &Read2Desc = TII->get(Opc); - const TargetRegisterClass *SuperRC = - (CI.EltSize == 4) ?
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); Register DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); @@ -1317,6 +1366,7 @@ .addImm(0) // tfe .addImm(CI.DLC) // dlc .addImm(0) // swz + .addImm(CI.SCCB) // scc .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI, Paired); @@ -1384,6 +1434,7 @@ .addImm(0) // tfe .addImm(CI.DLC) // dlc .addImm(0) // swz + .addImm(CI.SCCB) // scc .addMemOperand( combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); @@ -1464,6 +1515,7 @@ .addImm(0) // tfe .addImm(CI.DLC) // dlc .addImm(0) // swz + .addImm(CI.SCCB) // scc .addMemOperand( combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); @@ -1559,18 +1611,27 @@ case 16: return &AMDGPU::SGPR_512RegClass; } - } else { - switch (CI.Width + Paired.Width) { - default: - return nullptr; - case 2: - return &AMDGPU::VReg_64RegClass; - case 3: - return &AMDGPU::VReg_96RegClass; - case 4: - return &AMDGPU::VReg_128RegClass; - } } + const TargetRegisterClass *RC = nullptr; + + switch (CI.Width + Paired.Width) { + default: + return nullptr; + case 2: + RC = &AMDGPU::VReg_64RegClass; + break; + case 3: + RC = &AMDGPU::VReg_96RegClass; + break; + case 4: + RC = &AMDGPU::VReg_128RegClass; + break; + } + + if (TRI->hasAGPRs(getDataRegClass(*CI.I))) + return TRI->getEquivalentAGPRClass(RC); + + return RC; } MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( @@ -1624,6 +1685,7 @@ .addImm(0) // tfe .addImm(CI.DLC) // dlc .addImm(0) // swz + .addImm(CI.SCCB) // scc .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); moveInstsAfter(MIB, InstsToMove); Index: llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -300,6 +300,20 
@@ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const = 0; + /// Update \p MI memory store instruction to bypass any caches up to + /// the \p Scope memory scope for address spaces \p + /// AddrSpace. Return true iff the instruction was modified. + virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const = 0; + + /// Update \p MI memory read-modify-write instruction to bypass any caches up + /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true + /// iff the instruction was modified. + virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const = 0; + /// Update \p MI memory instruction of kind \p Op associated with address /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return /// true iff the instruction was modified. @@ -372,6 +386,14 @@ SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const override; + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, @@ -408,6 +430,55 @@ }; +class SIGfx90ACacheControl : public SIGfx7CacheControl { +protected: + + /// Sets SCC bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. 
+ bool enableSCCBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit(MI); + } + +public: + + SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI, + SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, + bool IsNonTemporal) const override; + + bool insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; + + bool insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + + bool insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + class SIGfx10CacheControl : public SIGfx7CacheControl { protected: @@ -717,6 +788,8 @@ /* static */ std::unique_ptr SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); + if (ST.hasGFX90AInsts()) + return std::make_unique(ST); if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) return std::make_unique(ST); if (Generation < AMDGPUSubtarget::GFX10) @@ -757,6 +830,32 @@ return Changed; } +bool SIGfx6CacheControl::enableStoreCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(!MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + /// The L1 cache is write through so 
does not need to be bypassed. There is no + /// bypass control for the L2 cache at the isa level. + + return Changed; +} + +bool SIGfx6CacheControl::enableRMWCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + /// The L1 cache is write through so does not need to be bypassed. There is no + /// bypass control for the L2 cache at the isa level. + + return Changed; +} + bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal( MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal) const { @@ -1000,6 +1099,300 @@ return Changed; } +bool SIGfx90ACacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + Changed |= enableSCCBit(MI); + Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::AGENT: + Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WORKGROUP: + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to bypass the L1 which is per CU. + // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be bypassed. + if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + return Changed; +} + +bool SIGfx90ACacheControl::enableStoreCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(!MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + Changed |= enableSCCBit(MI); + LLVM_FALLTHROUGH; + case SIAtomicScope::AGENT: + /// Do not set glc for store atomic operations as they implicitly write + /// through the L1 cache. + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. Store atomics implicitly write through the L1 + // cache. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. 
+ + return Changed; +} + +bool SIGfx90ACacheControl::enableRMWCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && MI->mayStore()); + bool Changed = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + Changed |= enableSCCBit(MI); + LLVM_FALLTHROUGH; + case SIAtomicScope::AGENT: + /// Do not set glc for RMW atomic operations as they implicitly bypass + /// the L1 cache, and the glc bit is instead used to indicate if they are + /// return or no-return. + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. RMW atomics implicitly bypass the L1 cache. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + return Changed; +} + +bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal( + MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op, + bool IsVolatile, bool IsNonTemporal) const { + // Only handle load and store, not atomic read-modify-write instructions. The + // latter use glc to indicate if the atomic returns a result and so must not + // be used for cache control. + assert(MI->mayLoad() ^ MI->mayStore()); + + // Only update load and store, not LLVM IR atomic read-modify-write + // instructions. The latter are always marked as volatile so cannot sensibly + // handle it as do not want to pessimize all atomics. Also they do not support + // the nontemporal attribute. + assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE); + + bool Changed = false; + + if (IsVolatile) { + if (Op == SIMemOp::LOAD) { + Changed |= enableGLCBit(MI); + } + Changed |= enableSCCBit(MI); + + // Ensure operation has completed at system scope to cause all volatile + // operations to be visible outside the program in a global order.
Do not + // request cross address space as only the global address space can be + // observable outside the program, so no need to cause a waitcnt for LDS + // address space operations. + Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false, + Position::AFTER); + + return Changed; + } + + if (IsNonTemporal) { + // Request L1 MISS_EVICT and L2 STREAM for load and store instructions. + Changed |= enableGLCBit(MI); + Changed |= enableSLCBit(MI); + return Changed; + } + + return Changed; +} + +bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + if (ST.isTgSplitEnabled()) { + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to wait for global or GDS memory operations + // to complete to ensure they are visible to waves in the other CUs. + // Otherwise in non-threadgroup split mode all waves of a work-group are on + // the same CU, so no need to wait for global memory as all waves in the + // work-group access the same L1, nor wait for GDS as accesses are ordered + // on a CU. + if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH | + SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) && + (Scope == SIAtomicScope::WORKGROUP)) { + // Same as GFX7 using agent scope. + Scope = SIAtomicScope::AGENT; + } + // In threadgroup split mode LDS cannot be allocated so no need to wait for + // LDS memory operations.
+ AddrSpace &= ~SIAtomicAddrSpace::LDS; + } + return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op, + IsCrossAddrSpaceOrdering, Pos); +} + +bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + if (!InsertCacheInv) + return false; + + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Ensures that following loads will not see stale remote VMEM data or + // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and + // CC will never be stale due to the local memory probes. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2)); + // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to + // remove any cache lines of earlier writes by the same wave and ensures + // later reads by the same wave will refetch the cache lines. + Changed = true; + break; + case SIAtomicScope::AGENT: + // Same as GFX7. + break; + case SIAtomicScope::WORKGROUP: + // In threadgroup split mode the waves of a work-group can be executing on + // different CUs. Therefore need to invalidate the L1 which is per CU. + // Otherwise in non-threadgroup split mode all waves of a work-group are + // on the same CU, and so the L1 does not need to be invalidated. + if (ST.isTgSplitEnabled()) { + // Same as GFX7 using agent scope. + Scope = SIAtomicScope::AGENT; + } + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Same as GFX7. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + + if (Pos == Position::AFTER) + --MI; + + Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos); + + return Changed; +} + +bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the + // hardware does not reorder memory operations by the same wave with + // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed + // to initiate writeback of any dirty cache lines of earlier writes by the + // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the + // writeback has completed. + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2)); + // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT + // vmcnt(0)" needed by the "BUFFER_WBL2". + Changed = true; + break; + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // Same as GFX7. 
+ break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if (Pos == Position::AFTER) + --MI; + + Changed |= + SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace, + IsCrossAddrSpaceOrdering, Pos); + + return Changed; +} + bool SIGfx10CacheControl::enableLoadCacheBypass( const MachineBasicBlock::iterator &MI, SIAtomicScope Scope, @@ -1324,6 +1717,13 @@ bool Changed = false; if (MOI.isAtomic()) { + if (MOI.getOrdering() == AtomicOrdering::Monotonic || + MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(), + MOI.getOrderingAddrSpace()); + } + if (MOI.getOrdering() == AtomicOrdering::Release || MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) Changed |= CC->insertRelease(MI, MOI.getScope(), @@ -1392,6 +1792,15 @@ bool Changed = false; if (MOI.isAtomic()) { + if (MOI.getOrdering() == AtomicOrdering::Monotonic || + MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(), + MOI.getInstrAddrSpace()); + } + if (MOI.getOrdering() == AtomicOrdering::Release || MOI.getOrdering() == AtomicOrdering::AcquireRelease || MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || Index: llvm/lib/Target/AMDGPU/SIProgramInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -41,10 +41,13 @@ uint32_t ScratchBlocks = 0; uint64_t ComputePGMRSrc2 = 0; + uint64_t ComputePGMRSrc3GFX90A = 0; uint32_t NumVGPR = 0; uint32_t NumArchVGPR = 0; uint32_t NumAccVGPR = 0; + uint32_t AccumOffset = 0; + uint32_t TgSplit = 0; uint32_t NumSGPR = 0; uint32_t LDSSize = 0; bool FlatUsed = false; Index: 
llvm/lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -279,6 +279,8 @@ LiveIntervals *LIS) const; const uint32_t *getAllVGPRRegMask() const; + const uint32_t *getAllAGPRRegMask() const; + const uint32_t *getAllVectorRegMask() const; const uint32_t *getAllAllocatableSRegMask() const; // \returns number of 32 bit registers covered by a \p LM Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -122,7 +122,9 @@ case CallingConv::Fast: case CallingConv::Cold: case CallingConv::AMDGPU_Gfx: - return CSR_AMDGPU_HighRegs_SaveList; + return MF->getSubtarget().hasGFX90AInsts() + ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList + : CSR_AMDGPU_HighRegs_SaveList; default: { // Dummy to not crash RegisterClassInfo. static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister; @@ -143,7 +145,9 @@ case CallingConv::Fast: case CallingConv::Cold: case CallingConv::AMDGPU_Gfx: - return CSR_AMDGPU_HighRegs_RegMask; + return MF.getSubtarget().hasGFX90AInsts() + ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask + : CSR_AMDGPU_HighRegs_RegMask; default: return nullptr; } @@ -181,6 +185,14 @@ return CSR_AMDGPU_AllVGPRs_RegMask; } +const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const { + return CSR_AMDGPU_AllAGPRs_RegMask; +} + +const uint32_t *SIRegisterInfo::getAllVectorRegMask() const { + return CSR_AMDGPU_AllVectorRegs_RegMask; +} + const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const { return CSR_AMDGPU_AllAllocatableSRegs_RegMask; } @@ -263,6 +275,12 @@ } unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); + // TODO: In an entry function without calls and AGPRs used it is possible + // to use the whole register budget for VGPRs. 
Even more it shall + // be possible to estimate maximum AGPR/VGPR pressure and split + // register file accordingly. + if (ST.hasGFX90AInsts()) + MaxNumVGPRs /= 2; unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) { unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); @@ -327,6 +345,13 @@ reserveRegisterTuples(Reserved, Reg); } + if (ST.hasGFX90AInsts()) + for (const TargetRegisterClass *RC : this->regclasses()) + if (getRegSizeInBits(*RC) > 32 && hasVectorRegisters(RC)) + for (unsigned Reg : *RC) + if (getEncodingValue(Reg) & 1) + Reserved.set(Reg); + // FIXME: Stop using reserved registers for this. for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs()) reserveRegisterTuples(Reserved, Reg); @@ -730,6 +755,7 @@ .addImm(0) // tfe .addImm(0) // dlc .addImm(0) // swz + .addImm(0) // scc .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, @@ -798,7 +824,8 @@ MCRegister SOffset = ScratchOffsetReg; const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); - const bool IsAGPR = hasAGPRs(RC); + // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. + const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC); const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; // Always use 4 byte operations for AGPRs because we need to scavenge @@ -996,6 +1023,7 @@ if (!IsFlat) MIB.addImm(0) // dlc .addImm(0); // swz + MIB.addImm(0); // scc MIB.addMemOperand(NewMMO); if (!IsAGPR && NeedSuperRegDef) @@ -2055,6 +2083,13 @@ unsigned DstSize = getRegSizeInBits(*DstRC); unsigned NewSize = getRegSizeInBits(*NewRC); + // Do not allow coalescing between odd and even lanes as it will + // result in misaligned tuple access.
+ if (ST.hasGFX90AInsts() && !isSGPRClass(NewRC) && + (getChannelFromSubReg(DstSubReg) & 1) != + (getChannelFromSubReg(SubReg) & 1)) + return false; + // Do not increase size of registers beyond dword, we would need to allocate // adjacent registers and constraint regalloc more than needed. Index: llvm/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -847,21 +847,36 @@ let isAllocatable = 0; } -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { +def VS_64 : RegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> { let isAllocatable = 0; } -def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, +def AV_32 : RegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add AGPR_32, VGPR_32)> { let isAllocatable = 0; } -def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32, +def AV_64 : RegisterClass<"AMDGPU", VReg_64.RegTypes, 32, (add AReg_64, VReg_64)> { let isAllocatable = 0; } } // End GeneratePressureSet = 0 +def AV_96 : RegisterClass<"AMDGPU", VReg_96.RegTypes, 32, + (add AReg_96, VReg_96)> { + let isAllocatable = 0; +} + +def AV_128 : RegisterClass<"AMDGPU", VReg_128.RegTypes, 32, + (add AReg_128, VReg_128)> { + let isAllocatable = 0; +} + +def AV_160 : RegisterClass<"AMDGPU", VReg_160.RegTypes, 32, + (add AReg_160, VReg_160)> { + let isAllocatable = 0; +} + //===----------------------------------------------------------------------===// // Register operands //===----------------------------------------------------------------------===// @@ -912,21 +927,38 @@ } } -multiclass SIRegOperand : - SIRegOperand32 { +multiclass SIRegOperand64 { let OperandNamespace = "AMDGPU" in { - def _b64 : RegisterOperand(rc#"_64")> { + def _b64 : RegisterOperand(rc#rc_suffix)> { let OperandType = opType#"_INT64"; let ParserMatchClass = RegImmMatcher; } - def _f64 : 
RegisterOperand(rc#"_64")> { + def _f64 : RegisterOperand(rc#rc_suffix)> { let OperandType = opType#"_FP64"; let ParserMatchClass = RegImmMatcher; } + + foreach _ = BoolToList.ret in + def _v2f32 : RegisterOperand(rc#rc_suffix)> { + let OperandType = opType#"_V2FP32"; + let ParserMatchClass = RegImmMatcher; + let DecoderMethod = "decodeOperand_VSrcV232"; + } + foreach _ = BoolToList.ret in + def _v2b32 : RegisterOperand(rc#rc_suffix)> { + let OperandType = opType#"_V2INT32"; + let ParserMatchClass = RegImmMatcher; + let DecoderMethod = "decodeOperand_VSrcV232"; + } } } +multiclass SIRegOperand : + SIRegOperand32, + SIRegOperand64; + // FIXME: 64-bit sources can sometimes use 32-bit constants. multiclass RegImmOperand : SIRegOperand; @@ -938,10 +970,18 @@ string rc_suffix = "_32"> : SIRegOperand32; +multiclass RegInlineOperand64 + : SIRegOperand64; + multiclass RegInlineOperandAC : SIRegOperand32; +multiclass RegInlineOperandAC64 + : SIRegOperand64; + //===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// @@ -1001,6 +1041,13 @@ //===----------------------------------------------------------------------===// defm VISrc : RegInlineOperand32<"VGPR", "VISrc">; +let DecoderMethod = "decodeOperand_VReg_64" in +defm VISrc_64 : RegInlineOperand64<"VReg", "VISrc_64", "_64">; +defm VISrc_128 : RegInlineOperandAC<"VReg", "VISrc_128", "_128">; +let DecoderMethod = "decodeOperand_VReg_256" in +defm VISrc_256 : RegInlineOperand64<"VReg", "VISrc_256", "_256">; +defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">; +defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">; //===----------------------------------------------------------------------===// // AVSrc_* Operands with an AGPR or VGPR @@ -1016,6 +1063,31 @@ let EncoderMethod = "getAVOperandEncoding"; } +def AVLdSt_32 : RegisterOperand { + let 
DecoderMethod = "DecodeAVLdSt_32RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVLdSt_64 : RegisterOperand { + let DecoderMethod = "DecodeAVLdSt_64RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVLdSt_96 : RegisterOperand { + let DecoderMethod = "DecodeAVLdSt_96RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVLdSt_128 : RegisterOperand { + let DecoderMethod = "DecodeAVLdSt_128RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + +def AVLdSt_160 : RegisterOperand { + let DecoderMethod = "DecodeAVLdSt_160RegisterClass"; + let EncoderMethod = "getAVOperandEncoding"; +} + //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant //===----------------------------------------------------------------------===// @@ -1024,3 +1096,8 @@ defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">; defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">; defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">; + +let DecoderMethod = "decodeOperand_AReg_64" in +defm AISrc_64 : RegInlineOperandAC64<"AReg", "AISrc_64", "_64">; +let DecoderMethod = "decodeOperand_AReg_256" in +defm AISrc_256 : RegInlineOperandAC64<"AReg", "AISrc_256", "_256">; Index: llvm/lib/Target/AMDGPU/SISchedule.td =================================================================== --- llvm/lib/Target/AMDGPU/SISchedule.td +++ llvm/lib/Target/AMDGPU/SISchedule.td @@ -54,10 +54,15 @@ // Half rate 64-bit instructions. def Write64Bit : SchedWrite; +// Integer multiplications. +def WriteIntMul : SchedWrite; + // mAI multipass instructions. 
def Write2PassMAI : SchedWrite; def Write8PassMAI : SchedWrite; def Write16PassMAI : SchedWrite; +def Write4PassDGEMM : SchedWrite; +def Write8PassDGEMM : SchedWrite; // FIXME: Should there be a class for instructions which are VALU // instructions and have VALU rates, but write to the SALU (i.e. VOPC @@ -80,6 +85,7 @@ def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; +def SIDPFullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? @@ -137,11 +143,13 @@ def : HWWriteRes; // XXX: Guessed ??? def : HWVALUWriteRes; - def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; + def : HWVALUWriteRes; + def : HWVALUWriteRes; + let ResourceCycles = [2] in def : HWWriteRes; let ResourceCycles = [8] in @@ -150,7 +158,6 @@ def : HWWriteRes; def : ReadAdvance; - def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; // Technically mfma reads can be from 0 to 4 cycles but that does not make // sense to model because its register setup is huge. In particular if we @@ -159,10 +166,6 @@ // need to consume 2 or 4 more vgprs to be initialized before the acc // write sequence. Just assume worst case here. 
def : ReadAdvance; - - def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>; - def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>; - def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>; } def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>; @@ -176,11 +179,13 @@ defm : SICommonWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; @@ -190,16 +195,44 @@ defm : SICommonWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; -def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; +def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; +def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_..._4X4X")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_..._16X16X")>; +def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>; } // End SchedModel = SIQuarterSpeedModel +let SchedModel = SIDPFullSpeedModel in { + +defm : SICommonWriteRes; + +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; +def : HWVALUWriteRes; + +def : InstRW<[WriteCopy], (instrs COPY)>; +def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; +def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X")>; +def : InstRW<[Write16PassMAI, MIMFMARead], (instregex 
"^V_MFMA_.32_32X32X")>; +def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>; +def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; + +} // End SchedModel = SIDPFullSpeedModel + let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). @@ -213,6 +246,7 @@ def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; +def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; Index: llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -407,6 +407,9 @@ char Flags = 0; if (TII->isWQM(Opcode)) { + // If LOD is not supported WQM is not needed. + if (!ST->hasExtendedImageInsts()) + continue; // Sampling instructions don't need to produce results for all pixels // in a quad, they just require all inputs of a quad to have been // computed for derivatives. Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -703,6 +703,7 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI); bool isGFX10_BEncoding(const MCSubtargetInfo &STI); bool hasGFX10_3Insts(const MCSubtargetInfo &STI); +bool isGFX90A(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); @@ -746,12 +747,17 @@ case AMDGPU::OPERAND_REG_INLINE_C_FP32: case AMDGPU::OPERAND_REG_INLINE_AC_INT32: case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: return 4; case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case 
AMDGPU::OPERAND_REG_INLINE_AC_FP64: return 8; case AMDGPU::OPERAND_REG_IMM_INT16: @@ -847,6 +853,11 @@ const GCNSubtarget *Subtarget, Align Alignment = Align(4)); +LLVM_READNONE +inline bool isLegal64BitDPPControl(unsigned DC) { + return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; +} + /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -345,7 +345,7 @@ << "gfx" << Version.Major << Version.Minor - << Version.Stepping; + << hexdigit(Version.Stepping, true); if (hasXNACK(*STI)) Stream << "+xnack"; @@ -402,6 +402,8 @@ unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) { // FIXME: Need to take scratch memory into account. + if (isGFX90A(*STI)) + return 8; if (!isGFX10Plus(*STI)) return 10; return hasGFX10_3Insts(*STI) ? 16 : 20; @@ -531,6 +533,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, Optional EnableWavefrontSize32) { + if (STI->getFeatureBits().test(FeatureGFX90AInsts)) + return 8; + bool IsWave32 = EnableWavefrontSize32 ? *EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); @@ -543,6 +548,8 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, Optional EnableWavefrontSize32) { + if (STI->getFeatureBits().test(FeatureGFX90AInsts)) + return 8; bool IsWave32 = EnableWavefrontSize32 ? *EnableWavefrontSize32 : @@ -552,12 +559,16 @@ } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { + if (STI->getFeatureBits().test(FeatureGFX90AInsts)) + return 512; if (!isGFX10Plus(*STI)) return 256; return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 
1024 : 512; } unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { + if (STI->getFeatureBits().test(FeatureGFX90AInsts)) + return 512; return 256; } @@ -653,6 +664,11 @@ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1); } + if (AMDGPU::isGFX90A(*STI)) { + AMDHSA_BITS_SET(KD.compute_pgm_rsrc3, + amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, + STI->getFeatureBits().test(FeatureTgSplit) ? 1 : 0); + } return KD; } @@ -1267,6 +1283,10 @@ return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts]; } +bool isGFX90A(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; +} + bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); @@ -1374,6 +1394,9 @@ case AMDGPU::OPERAND_REG_INLINE_AC_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: return true; default: return false; @@ -1418,16 +1441,19 @@ case AMDGPU::SReg_96RegClassID: case AMDGPU::VReg_96RegClassID: case AMDGPU::AReg_96RegClassID: + case AMDGPU::AV_96RegClassID: return 96; case AMDGPU::SGPR_128RegClassID: case AMDGPU::SReg_128RegClassID: case AMDGPU::VReg_128RegClassID: case AMDGPU::AReg_128RegClassID: + case AMDGPU::AV_128RegClassID: return 128; case AMDGPU::SGPR_160RegClassID: case AMDGPU::SReg_160RegClassID: case AMDGPU::VReg_160RegClassID: case AMDGPU::AReg_160RegClassID: + case AMDGPU::AV_160RegClassID: return 160; case AMDGPU::SGPR_192RegClassID: case AMDGPU::SReg_192RegClassID: Index: llvm/lib/Target/AMDGPU/VOP1Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -351,12 +351,12 @@ VOP1Inst<"v_rsq_legacy_f32", 
VOP_F32_F32, int_amdgcn_rsq_legacy>; } // End TRANS = 1, SchedRW = [WriteTrans32] - let SchedRW = [WriteDouble] in { + let SchedRW = [WriteTrans64] in { defm V_RCP_CLAMP_F64 : VOP1Inst<"v_rcp_clamp_f64", VOP_F64_F64>; defm V_RSQ_CLAMP_F64 : VOP1Inst<"v_rsq_clamp_f64", VOP_F64_F64, AMDGPUrsq_clamp>; - } // End SchedRW = [WriteDouble] + } // End SchedRW = [WriteTrans64] } // End SubtargetPredicate = isGFX6GFX7 let SubtargetPredicate = isGFX7GFX8GFX9 in { @@ -461,6 +461,18 @@ } // End Uses = [M0] } // End SubtargetPredicate = isGFX10Plus +def VOPProfileAccMov : VOP_NO_EXT { + let DstRC = RegisterOperand; + let Src0RC32 = RegisterOperand; + let Asm32 = " $vdst, $src0"; +} + +def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1> { + let SubtargetPredicate = isGFX90APlus; + let isReMaterializable = 1; + let isAsCheapAsAMove = 1; +} + //===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// @@ -823,6 +835,8 @@ defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>; defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>; +defm V_ACCVGPR_MOV_B32 : VOP1Only_Real_vi<0x52>; + // Copy of v_mov_b32 with $vdst as a use operand for use with VGPR // indexing mode. vdst can't be treated as a def for codegen purposes, // and an implicit use and def of the super register should be added. Index: llvm/lib/Target/AMDGPU/VOP2Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -289,28 +289,30 @@ def VOP_MADMK_F16 : VOP_MADMK ; def VOP_MADMK_F32 : VOP_MADMK ; +class getRegisterOperandForVT { + RegisterOperand ret = RegisterOperand.ret>; +} + // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory // and processing time but it makes it easier to convert to mad. 
class VOP_MAC : VOPProfile <[vt0, vt1, vt1, vt0]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); - let Ins64 = getIns64, 3, + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT.ret:$src2); + let Ins64 = getIns64.ret, 3, 0, HasModifiers, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - VGPR_32:$src2, // stub argument + getVregSrcForVT.ret:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP16 = !con(InsDPP, (ins FI:$fi)); - let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - VGPR_32:$src2, // stub argument + getVregSrcForVT.ret:$src2, // stub argument dpp8:$dpp8, FI:$fi); - let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, - VGPR_32:$src2, // stub argument + getVregSrcForVT.ret:$src2, // stub argument clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); @@ -335,6 +337,8 @@ def VOP_MAC_F32 : VOP_MAC ; let HasExtDPP = 0 in def VOP_MAC_LEGACY_F32 : VOP_MAC ; +let HasExtSDWA = 0, HasExt64BitDPP = 1 in +def VOP_MAC_F64 : VOP_MAC ; class VOP_DOT_ACC : VOP_MAC { let HasClamp = 0; @@ -448,6 +452,7 @@ let HasExt = 0; let HasExtDPP = 0; + let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; } @@ -464,6 +469,7 @@ let HasExt = 0; let HasExtDPP = 0; + let HasExt64BitDPP = 0; let HasExtSDWA = 0; let HasExtSDWA9 = 0; } @@ -692,6 +698,14 @@ } // End SubtargetPredicate = HasFmaLegacy32 +let SubtargetPredicate = isGFX90APlus, + Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1, + SchedRW = [WriteDoubleAdd] in +defm V_FMAC_F64 : VOP2Inst <"v_fmac_f64", VOP_MAC_F64>; + let Constraints = "$vdst = $src2", DisableEncoding="$src2", 
isConvertibleToThreeAddress = 1, @@ -1525,6 +1539,7 @@ defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>; defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>; defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>; +let AssemblerPredicate = isGCN3ExcludingGFX90A in defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>; defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>; defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>; @@ -1641,6 +1656,40 @@ } // End SubtargetPredicate = HasDLInsts +let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in { + multiclass VOP2_Real_e32_gfx90a op> { + def _e32_gfx90a : + VOP2_Real(NAME#"_e32"), SIEncodingFamily.GFX90A>, + VOP2e(NAME#"_e32").Pfl>; + } + + multiclass VOP2_Real_e64_gfx90a op> { + def _e64_gfx90a : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX90A>, + VOP3e_vi (NAME#"_e64").Pfl>; + } + + multiclass Base_VOP2_Real_e32e64_gfx90a op> : + VOP2_Real_e32_gfx90a, + VOP2_Real_e64_gfx90a<{0, 1, 0, 0, op{5-0}}>; + + multiclass VOP2_Real_e32e64_gfx90a op> : + Base_VOP2_Real_e32e64_gfx90a { + + foreach _ = BoolToList(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx90a : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX90A>, + VOP2_DPPe(NAME#"_dpp")> { + let DecoderNamespace = "SDWA9"; + } + } +} // End AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" + +let SubtargetPredicate = isGFX90APlus in { + defm V_FMAC_F64 : VOP2_Real_e32e64_gfx90a <0x4>; + defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>; +} // End SubtargetPredicate = isGFX90APlus + multiclass VOP2_Real_DOT_ACC_gfx9 op> : VOP2_Real_e32_vi { def _dpp_vi : VOP2_DPP(NAME#"_dpp")>; } Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -325,12 +325,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_like, 1>; } // End SchedRW = [WriteDoubleAdd] -let SchedRW = [WriteQuarterRate32] in { +let SchedRW = 
[WriteIntMul] in { defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", VOP3_Profile, mul>; defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", VOP3_Profile, mulhu>; defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile>; defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile, mulhs>; -} // End SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteIntMul] let Uses = [MODE, VCC, EXEC] in { // v_div_fmas_f32: @@ -447,10 +447,10 @@ } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] let isCommutable = 1 in { -let SchedRW = [WriteQuarterRate32, WriteSALU] in { +let SchedRW = [WriteIntMul, WriteSALU] in { defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; -} // End SchedRW = [WriteQuarterRate32, WriteSALU] +} // End SchedRW = [WriteIntMul, WriteSALU] } // End isCommutable = 1 } // End SubtargetPredicate = isGFX7Plus @@ -476,6 +476,7 @@ let FPDPRounding = 1 in { defm V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile, fmad>; let Uses = [MODE, M0, EXEC] in { + let OtherPredicates = [isNotGFX90APlus] in // For some reason the intrinsic operands are in a different order // from the instruction operands. def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>, @@ -497,24 +498,24 @@ let SubtargetPredicate = isGFX9Plus in { defm V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile>; defm V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile>; +let OtherPredicates = [isNotGFX90APlus] in def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; } // End SubtargetPredicate = isGFX9Plus -let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in { +// This predicate should only apply to the selection pattern. The +// instruction still exists and should decode on subtargets with +// other bank counts. 
+let OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>, [(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers), (i32 timm:$attrchan), (i32 timm:$attr), - (i1 timm:$high), M0))]> { - // This predicate should only apply to the selection pattern. The - // instruction still exists and should decode on subtargets with - // other bank counts. - let OtherPredicates = [has32BankLDS]; -} - + (i1 timm:$high), M0))]>; +} // End OtherPredicates = [isNotGFX90APlus, has32BankLDS], Uses = [MODE, M0, EXEC], FPDPRounding = 1 +let OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; -} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1 +} // End OtherPredicates = [isNotGFX90APlus], Uses = [MODE, M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 @@ -527,11 +528,11 @@ ), VGPR_32)), sub1) >; -let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in { +let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in { def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>; def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>; def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>; -} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] +} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in { Index: llvm/lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -340,9 +340,16 @@ 
(!cast("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; def ADst_32 : VOPDstOperand; +def ADst_64 : VOPDstOperand; def ADst_128 : VOPDstOperand; +def ADst_256 : VOPDstOperand; def ADst_512 : VOPDstOperand; def ADst_1024 : VOPDstOperand; +def VDst_64 : VOPDstOperand; +def VDst_128 : VOPDstOperand; +def VDst_256 : VOPDstOperand; +def VDst_512 : VOPDstOperand; +def VDst_1024 : VOPDstOperand; def VOPProfileAccRead : VOP3_Profile { let Src0RC64 = ARegSrc_32; @@ -362,6 +369,9 @@ let Src2RC64 = _SrcRC; let HasOpSel = 0; let HasClamp = 0; + let HasIntClamp = 0; + let HasOMod = 0; + let HasModifiers = 0; let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp"; let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp); } @@ -378,6 +388,29 @@ def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI; def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI; def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI; +def VOPProfileMAI_F32_V4I16_X4 : VOPProfileMAI; +def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI; +def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI; +def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI; +def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI; + +def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_F32_X32_VCD : VOPProfileMAI; +def VOPProfileMAI_I32_I32_X4_VCD : VOPProfileMAI; +def VOPProfileMAI_I32_I32_X16_VCD : VOPProfileMAI; +def VOPProfileMAI_I32_I32_X32_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V2I16_X4_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V2I16_X16_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V2I16_X32_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V4F16_X4_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V4F16_X16_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V4F16_X32_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V4I16_X4_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V4I16_X32_VCD : 
VOPProfileMAI; +def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI; +def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI; let Predicates = [HasMAIInsts] in { @@ -388,32 +421,57 @@ } // End isMoveImm = 1 } // End isAsCheapAsAMove = 1, isReMaterializable = 1 -// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. -let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { -defm V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>; -defm V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>; -defm V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>; -defm V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>; -defm V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>; -defm V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>; -defm V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>; -defm V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>; -defm V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>; -defm V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>; -defm V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>; -defm V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>; -defm V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32", VOPProfileMAI_F32_F32_X32, 
int_amdgcn_mfma_f32_32x32x1f32>; -defm V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32", VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>; -defm V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16", VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>; -defm V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16", VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>; -defm V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>; -defm V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>; -defm V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>; -defm V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>; -} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 +multiclass MAIInst { + let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in { + // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. 
+ defm "" : VOP3Inst("VOPProfileMAI_" # P), node>; + + let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in + defm _vgprcd : VOP3Inst("VOPProfileMAI_" # P # "_VCD")>; + } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 +} + +defm V_MFMA_F32_4X4X1F32 : MAIInst<"v_mfma_f32_4x4x1f32", "F32_F32_X4", int_amdgcn_mfma_f32_4x4x1f32>; +defm V_MFMA_F32_4X4X4F16 : MAIInst<"v_mfma_f32_4x4x4f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_4x4x4f16>; +defm V_MFMA_I32_4X4X4I8 : MAIInst<"v_mfma_i32_4x4x4i8", "I32_I32_X4", int_amdgcn_mfma_i32_4x4x4i8>; +defm V_MFMA_F32_16X16X1F32 : MAIInst<"v_mfma_f32_16x16x1f32", "F32_F32_X16", int_amdgcn_mfma_f32_16x16x1f32>; +defm V_MFMA_F32_16X16X4F32 : MAIInst<"v_mfma_f32_16x16x4f32", "F32_F32_X4", int_amdgcn_mfma_f32_16x16x4f32>; +defm V_MFMA_F32_16X16X4F16 : MAIInst<"v_mfma_f32_16x16x4f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>; +defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4", int_amdgcn_mfma_f32_16x16x16f16>; +defm V_MFMA_I32_16X16X4I8 : MAIInst<"v_mfma_i32_16x16x4i8", "I32_I32_X16", int_amdgcn_mfma_i32_16x16x4i8>; +defm V_MFMA_F32_32X32X1F32 : MAIInst<"v_mfma_f32_32x32x1f32", "F32_F32_X32", int_amdgcn_mfma_f32_32x32x1f32>; +defm V_MFMA_F32_32X32X2F32 : MAIInst<"v_mfma_f32_32x32x2f32", "F32_F32_X16", int_amdgcn_mfma_f32_32x32x2f32>; +defm V_MFMA_F32_32X32X4F16 : MAIInst<"v_mfma_f32_32x32x4f16", "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>; +defm V_MFMA_F32_32X32X8F16 : MAIInst<"v_mfma_f32_32x32x8f16", "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>; +defm V_MFMA_I32_32X32X4I8 : MAIInst<"v_mfma_i32_32x32x4i8", "I32_I32_X32", int_amdgcn_mfma_i32_32x32x4i8>; +defm V_MFMA_I32_16X16X16I8 : MAIInst<"v_mfma_i32_16x16x16i8", "I32_I32_X4", int_amdgcn_mfma_i32_16x16x16i8>; +defm V_MFMA_I32_32X32X8I8 : MAIInst<"v_mfma_i32_32x32x8i8", "I32_I32_X16", int_amdgcn_mfma_i32_32x32x8i8>; +defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2I16_X4", 
int_amdgcn_mfma_f32_4x4x2bf16>; +defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_16x16x2bf16>; +defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_16x16x8bf16>; +defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>; +defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>; } // End SubtargetPredicate = HasMAIInsts +let Predicates = [isGFX90APlus] in { + defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>; + defm V_MFMA_F32_16X16X4BF16_1K : MAIInst<"v_mfma_f32_16x16x4bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>; + defm V_MFMA_F32_4X4X4BF16_1K : MAIInst<"v_mfma_f32_4x4x4bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_4x4x4bf16_1k>; + defm V_MFMA_F32_32X32X8BF16_1K : MAIInst<"v_mfma_f32_32x32x8bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>; + defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_16x16x16bf16_1k>; + + defm V_MFMA_F64_16X16X4F64 : MAIInst<"v_mfma_f64_16x16x4f64", "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>; + defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>; +} // End Predicates = [isGFX90APlus] + +let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in { + def V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile, any_fma>; + def V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile, any_fmul>; + def V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile, any_fadd>; + def V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile>; +} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 + def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">; def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">; 
@@ -435,7 +493,7 @@ multiclass VOP3P_Real_MAI op> { def _vi : VOP3P_Real(NAME#"_e64"), SIEncodingFamily.VI>, - VOP3Pe_MAI (NAME#"_e64").Pfl> { + VOP3Pe_MAI (NAME#"_e64").Pfl, ?> { let AssemblerPredicate = HasMAIInsts; let DecoderNamespace = "GFX8"; let Inst{14} = ?; // op_sel_hi(2) @@ -444,9 +502,21 @@ } } -multiclass VOP3P_Real_MFMA op> { +multiclass VOP3P_Real_MFMA_gfx90a op> { + let SubtargetPredicate = isGFX90AOnly, + AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in { + def _gfx90a_acd : VOP3P_Real(NAME#"_e64"), SIEncodingFamily.GFX90A>, + VOP3Pe_MAI (NAME#"_e64").Pfl, 1>; + + def _gfx90a_vcd : VOP3P_Real(NAME # "_vgprcd" # "_e64"), SIEncodingFamily.GFX90A>, + VOP3Pe_MAI (NAME # "_vgprcd" # "_e64").Pfl, 0>; + } // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" +} + +multiclass VOP3P_Real_MFMA op> : + VOP3P_Real_MFMA_gfx90a { def _vi : VOP3P_Real(NAME#"_e64"), SIEncodingFamily.VI>, - VOP3Pe_MAI (NAME#"_e64").Pfl> { + VOP3Pe_MAI (NAME#"_e64").Pfl, ?> { let AssemblerPredicate = HasMAIInsts; let DecoderNamespace = "GFX8"; } @@ -536,6 +606,21 @@ } // End SubtargetPredicate = HasMAIInsts +defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x63>; +defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x64>; +defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x65>; +defm V_MFMA_F32_32X32X8BF16_1K : VOP3P_Real_MFMA_gfx90a <0x66>; +defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>; +defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>; +defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>; + +let SubtargetPredicate = HasPackedFP32Ops in { + defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>; + defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>; + defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>; + defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>; +} // End SubtargetPredicate = HasPackedFP32Ops + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// Index: llvm/lib/Target/AMDGPU/VOPInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/VOPInstructions.td +++ llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -333,7 +333,7 @@ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo) } -class VOP3Pe_MAI op, VOPProfile P> : Enc64 { +class VOP3Pe_MAI op, VOPProfile P, bit acc_cd = 0> : Enc64 { bits<8> vdst; bits<10> src0; bits<10> src1; @@ -341,14 +341,13 @@ bits<3> blgp; bits<3> cbsz; bits<4> abid; - bits<1> clamp; let Inst{7-0} = vdst; let Inst{10-8} = !if(P.HasSrc1, cbsz, 0); let Inst{14-11} = !if(P.HasSrc1, abid, 0); - let Inst{15} = !if(P.HasClamp, clamp{0}, 0); + let Inst{15} = acc_cd; let Inst{22-16} = op; let Inst{31-23} = 0x1a7; //encoding @@ -628,8 +627,8 @@ string AsmOperands = P.AsmDPP; let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", ""); - let SubtargetPredicate = HasDPP; - let AssemblerPredicate = HasDPP; + let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); + let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); @@ -683,8 +682,8 @@ let Size = 8; let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", ""); - let SubtargetPredicate = HasDPP; - let AssemblerPredicate = HasDPP; + let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); + let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP); let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); Index: llvm/test/Analysis/CostModel/AMDGPU/fadd.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ 
llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -2,6 +2,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s ; ALL-LABEL: 'fadd_f32' ; ALL: estimated cost of 1 for {{.*}} fadd float @@ -13,7 +14,8 @@ } ; ALL-LABEL: 'fadd_v2f32' -; ALL: estimated cost of 2 for {{.*}} fadd <2 x float> +; NOPACKEDF32: estimated cost of 2 for {{.*}} fadd <2 x float> +; PACKEDF32: estimated cost of 1 for {{.*}} fadd <2 x float> define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fadd <2 x float> %vec, %b @@ -22,7 +24,10 @@ } ; ALL-LABEL: 'fadd_v3f32' -; ALL: estimated cost of 3 for {{.*}} fadd <3 x float> +; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, +; and 3 when it is legal. 
+; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fadd <3 x float> +; PACKEDF32: estimated cost of 2 for {{.*}} fadd <3 x float> define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fadd <3 x float> %vec, %b @@ -31,7 +36,10 @@ } ; ALL-LABEL: 'fadd_v5f32' -; ALL: estimated cost of 5 for {{.*}} fadd <5 x float> +; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, +; and 5 when it is legal. +; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fadd <5 x float> +; PACKEDF32: estimated cost of 3 for {{.*}} fadd <5 x float> define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fadd <5 x float> %vec, %b @@ -40,6 +48,7 @@ } ; ALL-LABEL: 'fadd_f64' +; GFX90A-FASTF64: estimated cost of 1 for {{.*}} fadd double ; FASTF64: estimated cost of 2 for {{.*}} fadd double ; SLOWF64: estimated cost of 4 for {{.*}} fadd double ; SIZEALL: estimated cost of 2 for {{.*}} fadd double @@ -51,6 +60,7 @@ } ; ALL-LABEL: 'fadd_v2f64' +; GFX90A-FASTF64: estimated cost of 2 for {{.*}} fadd <2 x double> ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double> ; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double> ; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double> @@ -62,6 +72,7 @@ } ; ALL-LABEL: 'fadd_v3f64' +; GFX90A-FASTF64: estimated cost of 3 for {{.*}} fadd <3 x double> ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double> ; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double> ; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double> Index: llvm/test/Analysis/CostModel/AMDGPU/fma.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -2,6 +2,7 @@ ; 
RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF32,SLOWF16,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s ; ALL-LABEL: 'fma_f32' ; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32 @@ -16,7 +17,7 @@ ; ALL-LABEL: 'fma_v2f32' ; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32 -; FASTF32: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32 +; PACKEDF32: estimated cost of 2 for {{.*}} call <2 x float> @llvm.fma.v2f32 ; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32 define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr @@ -27,7 +28,7 @@ ; ALL-LABEL: 'fma_v3f32' ; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32 -; FASTF32: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 +; PACKEDF32: estimated cost of 4 for {{.*}} call <3 x float> @llvm.fma.v3f32 ; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32 define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr @@ -38,7 +39,7 @@ ; ALL-LABEL: 'fma_v5f32' ; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32 -; FASTF32: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 +; PACKEDF32: 
estimated cost of 6 for {{.*}} call <5 x float> @llvm.fma.v5f32 ; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32 define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr @@ -49,6 +50,7 @@ ; ALL-LABEL: 'fma_f64' ; SLOWF64: estimated cost of 4 for {{.*}} call double @llvm.fma.f64 +; GFX90A-FASTF64: estimated cost of 1 for {{.*}} call double @llvm.fma.f64 ; FASTF64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 ; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64 define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 { @@ -60,6 +62,7 @@ ; ALL-LABEL: 'fma_v2f64' ; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64 +; GFX90A-FASTF64: estimated cost of 2 for {{.*}} call <2 x double> @llvm.fma.v2f64 ; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 ; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64 define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 { Index: llvm/test/Analysis/CostModel/AMDGPU/fmul.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -2,6 +2,7 @@ ; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s ; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a 
-mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s ; ALL-LABEL: 'fmul_f32' ; ALL: estimated cost of 1 for {{.*}} fmul float @@ -13,7 +14,8 @@ } ; ALL-LABEL: 'fmul_v2f32' -; ALL: estimated cost of 2 for {{.*}} fmul <2 x float> +; NOPACKEDF32: estimated cost of 2 for {{.*}} fmul <2 x float> +; PACKEDF32: estimated cost of 1 for {{.*}} fmul <2 x float> define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 { %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr %add = fmul <2 x float> %vec, %b @@ -22,7 +24,10 @@ } ; ALL-LABEL: 'fmul_v3f32' -; ALL: estimated cost of 3 for {{.*}} fmul <3 x float> +; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening, +; and 3 when it is legal. +; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fmul <3 x float> +; PACKEDF32: estimated cost of 2 for {{.*}} fmul <3 x float> define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr %add = fmul <3 x float> %vec, %b @@ -31,7 +36,10 @@ } ; ALL-LABEL: 'fmul_v5f32' -; ALL: estimated cost of 5 for {{.*}} fmul <5 x float> +; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening, +; and 5 when it is legal. 
+; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fmul <5 x float> +; PACKEDF32: estimated cost of 3 for {{.*}} fmul <5 x float> define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 { %vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr %add = fmul <5 x float> %vec, %b @@ -40,6 +48,7 @@ } ; ALL-LABEL: 'fmul_f64' +; GFX90A-FASTF64: estimated cost of 1 for {{.*}} fmul double ; FASTF64: estimated cost of 2 for {{.*}} fmul double ; SLOWF64: estimated cost of 4 for {{.*}} fmul double ; SIZEALL: estimated cost of 2 for {{.*}} fmul double Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -0,0 +1,584 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A + +declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1) +declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) +declare double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) +declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) +declare double 
@llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) +declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data) +declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data) +declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data) +declare double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* nocapture, double, i32, i32, i1) + +define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: buffer_atomic_add_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + ret void +} + +define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: buffer_atomic_add_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel 
void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, 
<4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void 
@raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fadd_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fmin_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: 
global_atomic_fmax_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret void +} + +define double @global_atomic_fadd_f64_rtn(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fadd_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret double %ret +} + +define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fmax_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret double %ret +} + +define double @global_atomic_fmin_f64_rtn(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fmin_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double 
%data) + ret double %ret +} + +define amdgpu_kernel void @flat_atomic_fadd_f64_noret(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fadd_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data) + ret void +} + +define double @flat_atomic_fadd_f64_rtn(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fadd_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data) + ret double %ret +} + +define amdgpu_kernel void @flat_atomic_fmin_f64_noret(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fmin_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data) + ret void +} + +define double @flat_atomic_fmin_f64_rtn(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fmin_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data) + ret double %ret +} + +define amdgpu_kernel void @flat_atomic_fmax_f64_noret(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fmax_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data) + ret void +} + +define double @flat_atomic_fmax_f64_rtn(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fmax_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data) + ret double %ret +} + +define amdgpu_kernel void @local_atomic_fadd_f64_noret(double addrspace(3)* %ptr, double %data) { +; GFX90A-LABEL: local_atomic_fadd_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0) + ret void +} + +define double @local_atomic_fadd_f64_rtn(double addrspace(3)* %ptr, double %data) { +; GFX90A-LABEL: local_atomic_fadd_f64_rtn: 
+; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0) + ret double %ret +} + +define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(double addrspace(3)* %ptr) { +; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst + ret void +} + +define double @local_atomic_fadd_f64_rtn_pat(double addrspace(3)* %ptr, double %data) { +; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst + ret double %ret +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir @@ -19,7 +19,7 @@ ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY 
$vgpr2 ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -27,7 +27,7 @@ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -35,7 +35,7 @@ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10: $vgpr0 = COPY 
[[FLAT_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -71,7 +71,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -79,7 +79,7 @@ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -97,7 +97,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX10: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = 
V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -125,7 +125,7 @@ ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -133,7 +133,7 @@ ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], 
[[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -141,7 +141,7 @@ ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 @@ -177,7 +177,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -185,7 +185,7 
@@ ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -203,7 +203,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX10: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 @@ -241,7 +241,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 
0, implicit $exec ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -259,7 +259,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX9: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -277,7 +277,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX10: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -305,21 +305,21 @@ ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 @@ -343,21 +343,21 @@ ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX7: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX9: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, 
implicit $flat_scr :: (load store seq_cst 8) ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX10: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = COPY $vgpr4_vgpr5 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir @@ -49,7 +49,7 @@ ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global ; GFX8: liveins: 
$vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -57,7 +57,7 @@ ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -65,7 +65,7 @@ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -73,7 +73,7 @@ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], 
[[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -137,7 +137,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-FLAT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -155,7 +155,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX8: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 
4, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -163,7 +163,7 @@ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -171,7 +171,7 @@ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -227,7 +227,7 @@ ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, 
implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s64_global ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -235,7 +235,7 @@ ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_global ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -243,7 +243,7 @@ ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_global ; GFX10: liveins: 
$vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -251,7 +251,7 @@ ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 @@ -315,7 +315,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-FLAT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -333,7 +333,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX8: 
%12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -341,7 +341,7 @@ ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 @@ -349,7 +349,7 @@ ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, implicit 
$exec :: (load store seq_cst 8, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 4, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 @@ -435,7 +435,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX7-FLAT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -453,7 +453,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX8: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX8: 
[[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -461,7 +461,7 @@ ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 @@ -469,7 +469,7 @@ ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], -4, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -523,28 +523,28 @@ ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, 
[[COPY2]], %subreg.sub1 - ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_nortn ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 ; GFX10: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 @@ -594,28 +594,28 @@ ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_nortn ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX8: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX8: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX8: 
[[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX10: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr4_vgpr5 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3 - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN [[COPY]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = COPY $vgpr4_vgpr5 @@ -666,7 +666,7 @@ ; GFX7-FLAT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX7-FLAT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; 
GFX7-FLAT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]] - ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr ; GFX8: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3 @@ -675,7 +675,7 @@ ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX8: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY]] - ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], [[REG_SEQUENCE]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr ; GFX9: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3 @@ -684,7 +684,7 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, 0, 
implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr ; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3 @@ -693,7 +693,7 @@ ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -756,7 +756,7 @@ ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX7-FLAT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX7-FLAT: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095 ; GFX8: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3 @@ -775,7 +775,7 @@ ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE 
[[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX8: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX8: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX8: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY7]], [[REG_SEQUENCE2]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095 ; GFX9: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3 @@ -784,7 +784,7 @@ ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_sgpr_ptr_offset_4095 ; GFX10: liveins: $sgpr0_sgpr1, $vgpr2, $vgpr3 @@ -793,7 +793,7 @@ ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX10: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN 
[[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir @@ -17,19 +17,19 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX9-LABEL: name: flat_atomicrmw_add_s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: flat_atomicrmw_add_s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -51,17 +51,17 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9-LABEL: name: flat_atomicrmw_add_s32_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10-LABEL: name: flat_atomicrmw_add_s32_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store 
seq_cst 4) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 4, addrspace 0) @@ -91,13 +91,13 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -113,7 +113,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -147,12 +147,12 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2047_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -167,7 +167,7 @@ ; GFX10: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 @@ -199,13 +199,13 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset2048 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, implicit $exec, implicit $flat_scr :: (load store 
seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -221,7 +221,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -255,12 +255,12 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9-LABEL: name: 
flat_atomicrmw_add_s32_offset2048_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset2048_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -275,7 +275,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2048 @@ -307,13 +307,13 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: 
[[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -329,7 +329,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -363,12 +363,12 @@ ; GFX7: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -383,7 +383,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: 
(load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4095 @@ -415,7 +415,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4097 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 @@ -431,7 +431,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9: $vgpr0 = COPY 
[[FLAT_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4097 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -447,7 +447,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -481,7 +481,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX9-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -496,7 +496,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX9: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) ; GFX10-LABEL: name: flat_atomicrmw_add_s32_offset4097_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -511,7 +511,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) + ; GFX10: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4097 @@ -533,19 +533,19 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 
implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] ; GFX9-LABEL: name: flat_atomicrmw_add_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] ; GFX10-LABEL: name: flat_atomicrmw_add_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 @@ -567,17 +567,17 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, 
implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX9-LABEL: name: flat_atomicrmw_add_s64_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX10-LABEL: name: flat_atomicrmw_add_s64_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 8, addrspace 0) @@ -607,13 +607,13 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 
0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -629,7 +629,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 @@ -663,12 +663,12 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed 
[[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX9-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX9: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) ; GFX10-LABEL: name: flat_atomicrmw_add_s64_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -683,7 +683,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8) + ; GFX10: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: 
(load store seq_cst 8) %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_CONSTANT i64 4095 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir @@ -18,7 +18,7 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX6-LABEL: name: global_atomicrmw_add_s32 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 @@ -35,13 +35,13 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: global_atomicrmw_add_s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = 
GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -63,7 +63,7 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX6-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -78,12 +78,12 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10-LABEL: name: global_atomicrmw_add_s32_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 4, 
addrspace 1) @@ -113,7 +113,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 @@ -130,13 +130,13 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, 
addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -170,7 +170,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -185,12 +185,12 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2047_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2047, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN 
[[COPY]], [[COPY1]], 2047, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 @@ -222,7 +222,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 @@ -239,7 +239,7 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -255,7 +255,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], 
[[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -289,7 +289,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -304,7 +304,7 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 2048, 1, 0, 0, 
implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset2048_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -319,7 +319,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2048 @@ -351,7 +351,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX6: liveins: $vgpr0_vgpr1, 
$vgpr2 @@ -368,7 +368,7 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -384,7 +384,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -418,7 +418,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = 
FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -433,7 +433,7 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -448,7 +448,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4095 @@ -480,7 +480,7 
@@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_RTN]] ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 @@ -508,7 +508,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 @@ -524,7 +524,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: 
%10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_ATOMIC_ADD_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 @@ -558,7 +558,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 4, addrspace 1) ; GFX6-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -584,7 +584,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX9: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) ; GFX10-LABEL: name: global_atomicrmw_add_s32_offset4097_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -599,7 +599,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 4097 @@ -621,7 +621,7 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX7: $vgpr0_vgpr1 = COPY 
[[FLAT_ATOMIC_ADD_X2_RTN]] ; GFX6-LABEL: name: global_atomicrmw_add_s64 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -638,13 +638,13 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] ; GFX10-LABEL: name: global_atomicrmw_add_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 @@ -666,7 +666,7 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX6-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX6: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -681,12 +681,12 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX10-LABEL: name: global_atomicrmw_add_s64_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_ATOMICRMW_ADD %0, %1 :: (load store seq_cst 8, addrspace 1) @@ -716,7 +716,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr 
:: (load store seq_cst 8, addrspace 1) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_ADD_X2_RTN]] ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -733,7 +733,7 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] ; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 @@ -749,7 +749,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_ATOMIC_ADD_X2_RTN]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 @@ -783,7 +783,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %10:vgpr_32, dead %12:sreg_64_xexec = 
V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) + ; GFX7: [[FLAT_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst 8, addrspace 1) ; GFX6-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -798,7 +798,7 @@ ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) + ; GFX9: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[COPY]], [[COPY1]], 4095, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) ; GFX10-LABEL: name: global_atomicrmw_add_s64_offset4095_nortn ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -813,7 +813,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %10:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %10, %subreg.sub1 - ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, implicit $exec :: (load store 
seq_cst 8, addrspace 1) + ; GFX10: [[GLOBAL_ATOMIC_ADD_X2_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN [[REG_SEQUENCE1]], [[COPY1]], 0, 1, 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 %2:vgpr(s64) = G_CONSTANT i64 4095 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-copy.mir @@ -17,12 +17,12 @@ ; WAVE64: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 ; WAVE64: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[COPY]] ; WAVE64: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; WAVE64: FLAT_STORE_DWORD [[COPY1]], [[DEF]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; WAVE64: FLAT_STORE_DWORD [[COPY1]], [[DEF]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; WAVE32-LABEL: name: copy ; WAVE32: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr2_sgpr3 ; WAVE32: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; WAVE32: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[DEF]], [[COPY]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; WAVE32: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], [[DEF]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:sgpr(p1) = COPY $sgpr2_sgpr3 %1:vgpr(p1) = COPY %0 %2:vgpr(s32) = G_IMPLICIT_DEF @@ -46,7 +46,7 @@ ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY3]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE64: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit $exec - ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 
4, addrspace 1) + ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; WAVE32-LABEL: name: copy_vcc_bank_sgpr_bank ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 @@ -55,7 +55,7 @@ ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY3]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE32: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit $exec - ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 @@ -85,7 +85,7 @@ ; WAVE64: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY3]], implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_1]], implicit $exec ; WAVE64: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[COPY1]], [[V_CMP_NE_U32_e64_1]], implicit $exec - ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; WAVE32-LABEL: name: copy_vcc_bank_sgpr_bank_2_uses ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 @@ -96,7 +96,7 @@ ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[COPY3]], implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; 
WAVE32: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_CNDMASK_B32_e64_]], 0, [[COPY1]], [[V_CMP_NE_U32_e64_]], implicit $exec - ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 @@ -124,14 +124,14 @@ ; WAVE64: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; WAVE64: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY $scc ; WAVE64: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY1]], [[COPY3]], implicit $exec - ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; WAVE64: FLAT_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; WAVE32-LABEL: name: copy_vcc_bank_scc_physreg ; WAVE32: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; WAVE32: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; WAVE32: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc ; WAVE32: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY1]], [[COPY3]], implicit $exec - ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; WAVE32: GLOBAL_STORE_DWORD [[COPY]], [[V_CNDMASK_B32_e64_]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s32) = COPY $vgpr3 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum-ieee.mir @@ -24,9 +24,9 
@@ ; GFX7: %7:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX7: %8:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX7: %9:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7: %10:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec ; GFX7: %11:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; GFX7: %12:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec @@ -92,9 +92,9 @@ ; GFX7: %7:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX7: %8:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX7: %9:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 
0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7: %10:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec ; GFX7: %11:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; GFX7: %12:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmaxnum.mir @@ -25,9 +25,9 @@ ; GFX7: %7:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX7: %8:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX7: %9:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 
0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7: %10:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec ; GFX7: %11:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; GFX7: %12:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec @@ -91,9 +91,9 @@ ; GFX7: %7:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX7: %8:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX7: %9:vgpr_32 = nofpexcept V_MAX_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7: %10:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec ; GFX7: %11:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; GFX7: %12:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir +++ 
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum-ieee.mir @@ -24,9 +24,9 @@ ; GFX7: %7:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX7: %8:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX7: %9:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7: %10:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec ; GFX7: %11:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; GFX7: %12:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec @@ -92,9 +92,9 @@ ; GFX7: %7:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX7: %8:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX7: %9:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit 
$flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7: %10:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec ; GFX7: %11:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; GFX7: %12:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fminnum.mir @@ -25,9 +25,9 @@ ; GFX7: %7:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX7: %8:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX7: %9:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit 
$flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7: %10:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec ; GFX7: %11:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; GFX7: %12:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec @@ -91,9 +91,9 @@ ; GFX7: %7:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX7: %8:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX7: %9:vgpr_32 = nofpexcept V_MIN_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY3]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7: %10:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY4]], 0, [[COPY5]], 0, 0, implicit $mode, implicit $exec ; GFX7: %11:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; GFX7: %12:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[COPY5]], 0, [[COPY6]], 0, 0, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir 
=================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fmul.mir @@ -18,9 +18,9 @@ ; GCN: %4:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GCN: %5:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GCN: %6:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GCN: FLAT_STORE_DWORD [[COPY3]], %4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY3]], %5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY3]], %6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY3]], %4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY3]], %5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY3]], %6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 %2:vgpr(s32) = COPY $vgpr1 @@ -133,16 +133,16 @@ ; GCN: %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GCN: %14:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 3, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GCN: %15:vgpr_32 = nofpexcept V_MUL_F32_e64 3, [[COPY]], 1, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GCN: FLAT_STORE_DWORD [[COPY1]], %6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY1]], %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY1]], %8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 
1) - ; GCN: FLAT_STORE_DWORD [[COPY1]], %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY1]], %10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY1]], %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY1]], %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY1]], %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY1]], %14, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY1]], %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %14, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY1]], %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) %0:vgpr(s32) = 
COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(p1) = COPY $vgpr2_vgpr3 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir @@ -18,16 +18,16 @@ ; GCN: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; GCN: %3:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GCN: %4:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN: FLAT_STORE_DWORD [[COPY2]], %3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; GCN: FLAT_STORE_DWORD [[COPY2]], %4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY2]], %3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[COPY2]], %4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; VI-LABEL: name: fptoui ; VI: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; VI: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; VI: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; VI: %3:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI: %4:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; VI: FLAT_STORE_DWORD [[COPY2]], %3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; VI: FLAT_STORE_DWORD [[COPY2]], %4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; VI: FLAT_STORE_DWORD [[COPY2]], %3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; VI: FLAT_STORE_DWORD [[COPY2]], %4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -26,7 +26,7 @@ ; CHECK: %12:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 1, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; CHECK: %15:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %12, 0, 0, implicit $mode, implicit $exec ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %15, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %15, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; CHECK: S_ENDPGM 0 %2:sgpr(p4) = COPY $sgpr0_sgpr1 %7:sgpr(s64) = G_CONSTANT i64 36 @@ -75,7 +75,7 @@ ; CHECK: %13:vreg_64 = nofpexcept V_ADD_F64_e64 0, [[COPY3]], 3, [[COPY4]], 0, 0, implicit $mode, implicit $exec ; CHECK: %16:vreg_64 = nofpexcept V_FRACT_F64_e64 0, %13, 0, 0, implicit $mode, implicit $exec ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %16, [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; CHECK: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], %16, [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; CHECK: S_ENDPGM 0 %2:sgpr(p4) = COPY $sgpr0_sgpr1 %7:sgpr(s64) = G_CONSTANT i64 36 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-implicit-def.mir @@ -99,7 +99,7 @@ ; GCN-LABEL: name: implicit_def_p1_vgpr ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, 
implicit $exec - ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) %0:vgpr(p1) = G_IMPLICIT_DEF %1:vgpr(s32) = G_CONSTANT i32 4 G_STORE %1, %0 :: (store 4, addrspace 1) @@ -117,7 +117,7 @@ ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec ; GCN: $m0 = S_MOV_B32 -1 - ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) %0:vgpr(p3) = G_IMPLICIT_DEF %1:vgpr(s32) = G_CONSTANT i32 4 G_STORE %1, %0 :: (store 4, addrspace 1) @@ -134,7 +134,7 @@ ; GCN-LABEL: name: implicit_def_p4_vgpr ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec - ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GCN: FLAT_STORE_DWORD [[DEF]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) %0:vgpr(p4) = G_IMPLICIT_DEF %1:vgpr(s32) = G_CONSTANT i32 4 G_STORE %1, %0 :: (store 4, addrspace 1) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir @@ -16,12 +16,12 @@ ; GFX7-LABEL: name: load_atomic_flat_s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, 
implicit $flat_scr :: (load seq_cst 4) + ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) + ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 0) @@ -97,12 +97,12 @@ ; GFX7-LABEL: name: load_atomic_flat_s64_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8) + ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_atomic_flat_s64_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8) + ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 0) @@ -242,7 +242,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], 
[[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) + ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 @@ -257,7 +257,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) + ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 @@ -291,12 +291,12 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = 
FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) + ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_atomic_flat_s32_seq_cst_gep_4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) + ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 4095, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir @@ -23,7 +23,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 @@ -33,17 +33,17 @@ ; GFX7: 
[[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load seq_cst 4, align 4, addrspace 1) @@ -144,7 +144,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) + ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] ; GFX7-LABEL: name: load_atomic_global_s64_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 @@ -154,17 +154,17 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) + ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) ; GFX7: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_atomic_global_s64_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_atomic_global_s64_seq_cst ; GFX9: liveins: 
$vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_LOAD %0 :: (load seq_cst 8, align 8, addrspace 1) @@ -349,7 +349,7 @@ ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048 ; GFX7: liveins: $vgpr0_vgpr1 @@ -369,7 +369,7 @@ ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 
0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -384,12 +384,12 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], -2048, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], -2048, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 @@ -418,7 +418,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 
0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] ; GFX7-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095 ; GFX7: liveins: $vgpr0_vgpr1 @@ -428,7 +428,7 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -443,12 +443,12 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 4, 
addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_atomic_global_s32_seq_cst_gep_4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4095, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4095, 0, 0, 0, 0, implicit $exec :: (load seq_cst 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 @@ -487,7 +487,7 @@ ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) + ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] ; GFX7-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048 ; GFX7: liveins: $vgpr0_vgpr1 @@ -507,7 +507,7 @@ ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) + ; GFX7: 
[[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) ; GFX7: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -522,12 +522,12 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst 8, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_atomic_global_s64_seq_cst_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], -2048, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], -2048, 0, 0, 0, 0, implicit $exec :: (load seq_cst 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir +++ 
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir @@ -19,22 +19,22 @@ ; GFX7-LABEL: name: load_flat_s32_from_4 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX8-LABEL: name: load_flat_s32_from_4 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_flat_s32_from_4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX10-LABEL: name: load_flat_s32_from_4 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 0) @@ -56,22 +56,22 @@ ; GFX7-LABEL: name: load_flat_s32_from_2 ; GFX7: liveins: 
$vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2) + ; GFX7: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] ; GFX8-LABEL: name: load_flat_s32_from_2 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2) + ; GFX8: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] ; GFX9-LABEL: name: load_flat_s32_from_2 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2) + ; GFX9: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] ; GFX10-LABEL: name: load_flat_s32_from_2 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2) + ; GFX10: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 0) @@ -93,22 +93,22 @@ ; GFX7-LABEL: name: load_flat_s32_from_1 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = 
FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 0) @@ -129,19 +129,19 @@ ; GFX7-LABEL: name: load_flat_v2s32 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 
0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX8-LABEL: name: load_flat_v2s32 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_flat_v2s32 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX10-LABEL: name: load_flat_v2s32 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 8, addrspace 0) @@ -163,22 +163,22 @@ ; GFX7-LABEL: name: load_flat_v3s32 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4) + ; GFX7: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4) ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] ; GFX8-LABEL: name: load_flat_v3s32 
; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4) + ; GFX8: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4) ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] ; GFX9-LABEL: name: load_flat_v3s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4) + ; GFX9: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] ; GFX10-LABEL: name: load_flat_v3s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4) + ; GFX10: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4) ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<3 x s32>) = G_LOAD %0 :: (load 12, align 4, addrspace 0) @@ -200,22 +200,22 @@ ; GFX7-LABEL: name: load_flat_v4s32 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) + ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; 
GFX8-LABEL: name: load_flat_v4s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) + ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX9-LABEL: name: load_flat_v4s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) + ; GFX9: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX10-LABEL: name: load_flat_v4s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) + ; GFX10: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load 16, align 4, addrspace 0) @@ -237,22 +237,22 @@ ; GFX7-LABEL: name: load_flat_s64 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX7: $vgpr0_vgpr1 = COPY 
[[FLAT_LOAD_DWORDX2_]] ; GFX8-LABEL: name: load_flat_s64 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_flat_s64 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX10-LABEL: name: load_flat_s64 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 8, addrspace 0) @@ -274,22 +274,22 @@ ; GFX7-LABEL: name: load_flat_v2s64 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) + ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX8-LABEL: name: 
load_flat_v2s64 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) + ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX9-LABEL: name: load_flat_v2s64 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) + ; GFX9: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX10-LABEL: name: load_flat_v2s64 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) + ; GFX10: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4) ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load 16, align 4, addrspace 0) @@ -422,22 +422,22 @@ ; GFX7-LABEL: name: load_flat_p3_from_4 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; 
GFX8-LABEL: name: load_flat_p3_from_4 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_flat_p3_from_4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX10-LABEL: name: load_flat_p3_from_4 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 0) @@ -459,22 +459,22 @@ ; GFX7-LABEL: name: load_flat_p1_from_8 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX8-LABEL: name: load_flat_p1_from_8 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; 
GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_flat_p1_from_8 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX10-LABEL: name: load_flat_p1_from_8 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 8, addrspace 0) @@ -566,22 +566,22 @@ ; GFX7-LABEL: name: load_flat_v2s16 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX8-LABEL: name: load_flat_v2s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit 
$exec, implicit $flat_scr :: (load 4) + ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_flat_v2s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX9: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX10-LABEL: name: load_flat_v2s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; GFX10: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 0) @@ -603,22 +603,22 @@ ; GFX7-LABEL: name: load_flat_v4s16 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX8-LABEL: name: load_flat_v4s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 
0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_flat_v4s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX9: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX9: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX10-LABEL: name: load_flat_v4s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) + ; GFX10: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8) ; GFX10: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load 8, align 8, addrspace 0) @@ -728,7 +728,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1 @@ -743,12 +743,12 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2047 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2047 ; GFX10: liveins: $vgpr0_vgpr1 @@ -763,7 +763,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 
%1:vgpr(s64) = G_CONSTANT i64 2047 @@ -797,7 +797,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_2048 ; GFX8: liveins: $vgpr0_vgpr1 @@ -812,12 +812,12 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 2048, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: 
(load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_2048 ; GFX10: liveins: $vgpr0_vgpr1 @@ -832,7 +832,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 2048 @@ -866,7 +866,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m2047 ; GFX8: liveins: $vgpr0_vgpr1 @@ -881,7 +881,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = 
V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m2047 ; GFX9: liveins: $vgpr0_vgpr1 @@ -896,7 +896,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m2047 ; GFX10: liveins: $vgpr0_vgpr1 @@ -911,7 +911,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) 
+ ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2047 @@ -945,7 +945,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m2048 ; GFX8: liveins: $vgpr0_vgpr1 @@ -960,7 +960,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 @@ -975,7 +975,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m2048 ; GFX10: liveins: $vgpr0_vgpr1 @@ -990,7 +990,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 @@ -1024,7 +1024,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - 
; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_4095 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1039,12 +1039,12 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 4095, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_4095 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1059,7 +1059,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec 
; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 @@ -1093,7 +1093,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_4096 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1108,7 +1108,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit 
$exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_4096 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1123,7 +1123,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_4096 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1138,7 +1138,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4096 @@ -1172,7 +1172,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: 
%9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m4095 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1187,7 +1187,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m4095 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1202,7 +1202,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, 
implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m4095 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1217,7 +1217,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4095 @@ -1251,7 +1251,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m4096 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1266,7 +1266,7 @@ ; GFX8: 
[[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m4096 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1281,7 +1281,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m4096 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1296,7 +1296,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4096 @@ -1330,7 +1330,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_8191 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1345,7 +1345,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = 
COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_8191 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1360,7 +1360,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_8191 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1375,7 +1375,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8191 @@ -1409,7 +1409,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], 
[[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_8192 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1424,7 +1424,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_8192 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1439,7 +1439,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: 
[[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_8192 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1454,7 +1454,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8192 @@ -1488,7 +1488,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m8191 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1503,7 +1503,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_flat_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1518,7 +1518,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m8191 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1533,7 +1533,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, 
%subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8191 @@ -1567,7 +1567,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX7: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_flat_s32_from_1_gep_m8192 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1582,7 +1582,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: 
name: load_flat_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1597,7 +1597,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX9: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX9: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX10-LABEL: name: load_flat_s32_from_1_gep_m8192 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1612,7 +1612,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) + ; GFX10: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1) ; GFX10: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8192 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir +++ 
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir @@ -16,13 +16,13 @@ ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(p1) = COPY %0 @@ -47,13 +47,13 @@ ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_zext_vgpr ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: 
[[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 @@ -81,13 +81,13 @@ ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_merge_zext_vgpr ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 @@ -125,7 +125,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX9: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX9: 
[[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_merge_not_0_vgpr ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 @@ -141,7 +141,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec ; GFX10: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 @@ -169,7 +169,7 @@ ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 4095, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], 4095, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset4095 ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 @@ -195,7 +195,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 
[[COPY7]], [[COPY8]], 0, implicit $exec ; GFX10: %14:vgpr_32, dead %16:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %14, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 @@ -225,7 +225,7 @@ ; GFX9: liveins: $sgpr0_sgpr1, $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], -4096, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[COPY1]], -4096, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_zext_vgpr_offset_neg4096 ; GFX10: liveins: $sgpr0_sgpr1, $vgpr0 @@ -251,7 +251,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY7]], [[COPY8]], 0, implicit $exec ; GFX10: %14:vgpr_32, dead %16:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY9]], [[COPY10]], killed [[V_ADD_CO_U32_e64_3]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_2]], %subreg.sub0, %14, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[REG_SEQUENCE2]], 0, 0, 0, 0, 0, implicit 
$exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:vgpr(s32) = COPY $vgpr0 @@ -279,13 +279,13 @@ ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4096 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4096 @@ -310,13 +310,13 @@ ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4097 
; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4097 @@ -351,7 +351,7 @@ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4097 ; GFX10: liveins: $sgpr0_sgpr1 @@ -367,7 +367,7 @@ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) 
= G_CONSTANT i64 -4097 @@ -392,13 +392,13 @@ ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2049, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_2049 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec - ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 1, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 2049 @@ -423,7 +423,7 @@ ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], -2049, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg2049 ; GFX10: liveins: $sgpr0_sgpr1 @@ -439,7 +439,7 @@ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = 
S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -2049 @@ -463,13 +463,13 @@ ; GFX9: liveins: $sgpr0_sgpr1 ; GFX9: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec - ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4095, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 4095, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967295 ; GFX10: liveins: $sgpr0_sgpr1 ; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1 ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec - ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2047, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[COPY]], [[V_MOV_B32_e32_]], 2047, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4294967295 @@ -503,7 +503,7 @@ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], 
implicit-def $scc, implicit $scc ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_4294967296 ; GFX10: liveins: $sgpr0_sgpr1 @@ -519,7 +519,7 @@ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4294967296 @@ -554,7 +554,7 @@ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: 
load_global_s32_from_sgpr_base_offset_4294971390 ; GFX10: liveins: $sgpr0_sgpr1 @@ -570,7 +570,7 @@ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 4294971390 @@ -605,7 +605,7 @@ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967295 ; GFX10: liveins: $sgpr0_sgpr1 @@ -621,7 +621,7 @@ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = 
GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -4294967295 @@ -655,7 +655,7 @@ ; GFX9: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX9: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_s32_from_sgpr_base_offset_neg4294967296 ; GFX10: liveins: $sgpr0_sgpr1 @@ -671,7 +671,7 @@ ; GFX10: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def $scc, implicit $scc ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 ; GFX10: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = COPY $sgpr0_sgpr1 %1:sgpr(s64) = G_CONSTANT i64 -4294967296 @@ -693,12 +693,12 @@ ; GFX9-LABEL: name: load_global_s32_from_copy_undef_sgpr ; GFX9: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD 
[[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_s32_from_copy_undef_sgpr ; GFX10: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:sgpr(p1) = G_IMPLICIT_DEF %1:vgpr(p1) = COPY %0 @@ -717,11 +717,11 @@ bb.0: ; GFX9-LABEL: name: load_global_s32_from_undef_vgpr ; GFX9: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_s32_from_undef_vgpr ; GFX10: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = G_IMPLICIT_DEF %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 1) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir @@ -27,7 +27,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: 
[[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX6: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_4 ; GFX7: liveins: $vgpr0_vgpr1 @@ -37,27 +37,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX7: [[BUFFER_LOAD_DWORD_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_4 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX8-LABEL: name: load_global_s32_from_4 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: 
[[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_global_s32_from_4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_s32_from_4 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 1) @@ -84,7 +84,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1) + ; GFX6: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1) ; GFX6: $vgpr0 = COPY 
[[BUFFER_LOAD_USHORT_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_2 ; GFX7: liveins: $vgpr0_vgpr1 @@ -94,27 +94,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1) + ; GFX7: [[BUFFER_LOAD_USHORT_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_2 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] ; GFX8-LABEL: name: load_global_s32_from_2 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1) + ; GFX8: [[FLAT_LOAD_USHORT:%[0-9]+]]:vgpr_32 = FLAT_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 2, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_USHORT]] ; GFX9-LABEL: name: load_global_s32_from_2 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 
0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] ; GFX10-LABEL: name: load_global_s32_from_2 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_USHORT:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_USHORT]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 1) @@ -141,7 +141,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1 ; GFX7: liveins: $vgpr0_vgpr1 @@ -151,27 +151,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 
0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE 
[[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 1) @@ -198,7 +198,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX6: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX6: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] ; GFX7-LABEL: name: load_global_v2s32 ; GFX7: liveins: $vgpr0_vgpr1 @@ -208,27 +208,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX7: [[BUFFER_LOAD_DWORDX2_ADDR64_:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX7: $vgpr0_vgpr1 = COPY [[BUFFER_LOAD_DWORDX2_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_v2s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit 
$flat_scr :: (load 8, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX8-LABEL: name: load_global_v2s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_global_v2s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; GFX10-LABEL: name: load_global_v2s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 8, addrspace 1) @@ -255,7 +255,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], 
%subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) + ; GFX6: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] ; GFX7-LABEL: name: load_global_v4s32 ; GFX7: liveins: $vgpr0_vgpr1 @@ -265,27 +265,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) + ; GFX7: [[BUFFER_LOAD_DWORDX4_ADDR64_:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUFFER_LOAD_DWORDX4_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_v4s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX8-LABEL: name: load_global_v4s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY 
$vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX9-LABEL: name: load_global_v4s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] ; GFX10-LABEL: name: load_global_v4s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s32>) = G_LOAD %0 :: (load 16, align 4, addrspace 1) @@ -312,27 +312,27 @@ ; GFX7-LABEL: name: load_global_s64 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX7-FLAT-LABEL: name: 
load_global_s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX8-LABEL: name: load_global_s64 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_global_s64 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; GFX10-LABEL: name: load_global_s64 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 8, addrspace 1) @@ -359,27 +359,27 @@ ; GFX7-LABEL: name: 
load_global_v2s64 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) + ; GFX7: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX7-FLAT-LABEL: name: load_global_v2s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX8-LABEL: name: load_global_v2s64 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = FLAT_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, align 4, addrspace 1) ; GFX8: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[FLAT_LOAD_DWORDX4_]] ; GFX9-LABEL: name: load_global_v2s64 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY 
[[GLOBAL_LOAD_DWORDX4_]] ; GFX10-LABEL: name: load_global_v2s64 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 4, addrspace 1) ; GFX10: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[GLOBAL_LOAD_DWORDX4_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s64>) = G_LOAD %0 :: (load 16, align 4, addrspace 1) @@ -500,27 +500,27 @@ ; GFX7-LABEL: name: load_global_p3_from_4 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX7-FLAT-LABEL: name: load_global_p3_from_4 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX8-LABEL: name: load_global_p3_from_4 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, 
addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_global_p3_from_4 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_p3_from_4 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 1) @@ -547,27 +547,27 @@ ; GFX7-LABEL: name: load_global_p1_from_8 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX7-FLAT-LABEL: name: load_global_p1_from_8 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1 
= COPY [[FLAT_LOAD_DWORDX2_]] ; GFX8-LABEL: name: load_global_p1_from_8 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_global_p1_from_8 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; GFX10-LABEL: name: load_global_p1_from_8 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 8, addrspace 1) @@ -688,27 +688,27 @@ ; GFX7-LABEL: name: load_global_v2s16 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + ; GFX7: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) ; GFX7: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; 
GFX7-FLAT-LABEL: name: load_global_v2s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX8-LABEL: name: load_global_v2s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_DWORD]] ; GFX9-LABEL: name: load_global_v2s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] ; GFX10-LABEL: name: load_global_v2s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_DWORD]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 1) @@ -735,27 +735,27 @@ ; GFX7-LABEL: name: load_global_v4s16 ; GFX7: liveins: $vgpr0_vgpr1 ; 
GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX7: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX7: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX7-FLAT-LABEL: name: load_global_v4s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX8-LABEL: name: load_global_v4s16 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8, addrspace 1) ; GFX8: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; GFX9-LABEL: name: load_global_v4s16 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX9: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; GFX10-LABEL: name: load_global_v4s16 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: 
[[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; GFX10: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load 8, align 8, addrspace 1) @@ -833,7 +833,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX7: liveins: $vgpr0_vgpr1 @@ -843,7 +843,7 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: 
load_global_s32_from_1_gep_2047 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -858,7 +858,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1 @@ -873,17 +873,17 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: 
[[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_2047 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 2047 @@ -912,7 +912,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX7: liveins: $vgpr0_vgpr1 @@ -922,7 +922,7 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; 
GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 2048, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -937,7 +937,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX8: liveins: $vgpr0_vgpr1 @@ -952,12 +952,12 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: 
load_global_s32_from_1_gep_2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 2048, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_2048 ; GFX10: liveins: $vgpr0_vgpr1 @@ -972,7 +972,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 2048 @@ -1011,7 +1011,7 @@ ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1031,7 +1031,7 @@ ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1046,7 +1046,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1061,17 +1061,17 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2047 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2047 @@ -1110,7 +1110,7 @@ ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6: 
[[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1130,7 +1130,7 @@ ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1145,7 +1145,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE 
[[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1160,17 +1160,17 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_m2048 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -2048, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -2048 @@ -1199,7 +1199,7 @@ ; 
GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1209,7 +1209,7 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1224,7 +1224,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1239,12 +1239,12 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_4095 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1259,7 +1259,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: 
%9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4095 @@ -1289,7 +1289,7 @@ ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1300,7 +1300,7 @@ ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 
0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1315,7 +1315,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1330,7 +1330,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1345,7 +1345,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_4096 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1360,7 +1360,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 4096 @@ -1399,7 +1399,7 @@ ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], 
[[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1419,7 +1419,7 @@ ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1434,7 +1434,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: 
$vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1449,12 +1449,12 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4095 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1469,7 +1469,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; 
GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4095 @@ -1508,7 +1508,7 @@ ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1528,7 +1528,7 @@ ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1543,7 +1543,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, 
[[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1558,12 +1558,12 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[COPY]], -4096, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: 
$vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_m4096 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1578,7 +1578,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -4096 @@ -1608,7 +1608,7 @@ ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1619,7 +1619,7 @@ ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8191 - ; GFX7: 
[[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1634,7 +1634,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1649,7 +1649,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: 
[[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1664,7 +1664,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_8191 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1679,7 +1679,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8191 @@ -1709,7 +1709,7 @@ ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX6: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1720,7 +1720,7 @@ ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 ; GFX7: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 8192 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[COPY]], [[REG_SEQUENCE1]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1735,7 +1735,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 
0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1750,7 +1750,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_8192 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1765,7 +1765,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_8192 
; GFX10: liveins: $vgpr0_vgpr1 @@ -1780,7 +1780,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 8192 @@ -1819,7 +1819,7 @@ ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1839,7 +1839,7 @@ ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - 
; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1854,7 +1854,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1869,7 +1869,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: 
[[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0_vgpr1 @@ -1884,7 +1884,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8191 ; GFX10: liveins: $vgpr0_vgpr1 @@ -1899,7 +1899,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8191 @@ -1938,7 +1938,7 @@ ; GFX6: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE 
[[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX6: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX7: liveins: $vgpr0_vgpr1 @@ -1958,7 +1958,7 @@ ; GFX7: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX7: [[BUFFER_LOAD_UBYTE_ADDR64_:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_ADDR64 [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 @@ -1973,7 +1973,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, 
%subreg.sub1 - ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX7-FLAT: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX8-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX8: liveins: $vgpr0_vgpr1 @@ -1988,7 +1988,7 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) + ; GFX8: [[FLAT_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = FLAT_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 1, addrspace 1) ; GFX8: $vgpr0 = COPY [[FLAT_LOAD_UBYTE]] ; GFX9-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0_vgpr1 @@ -2003,7 +2003,7 @@ ; GFX9: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX9: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX9: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 
1, addrspace 1) ; GFX9: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] ; GFX10-LABEL: name: load_global_s32_from_1_gep_m8192 ; GFX10: liveins: $vgpr0_vgpr1 @@ -2018,7 +2018,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY3]], [[COPY4]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 1) ; GFX10: $vgpr0 = COPY [[GLOBAL_LOAD_UBYTE]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = G_CONSTANT i64 -8192 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7-FLAT %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s @@ -23,27 +24,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = 
REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX3_ADDR64_:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1) + ; GFX7: [[BUFFER_LOAD_DWORDX3_ADDR64_:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_ADDR64 [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1) ; GFX7: $vgpr0_vgpr1_vgpr2 = COPY [[BUFFER_LOAD_DWORDX3_ADDR64_]] ; GFX7-FLAT-LABEL: name: load_global_v3s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX7-FLAT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1) + ; GFX7-FLAT: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1) ; GFX7-FLAT: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] ; GFX8-LABEL: name: load_global_v3s32 ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX8: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1) + ; GFX8: [[FLAT_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = FLAT_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12, align 4, addrspace 1) ; GFX8: $vgpr0_vgpr1_vgpr2 = COPY [[FLAT_LOAD_DWORDX3_]] ; GFX9-LABEL: name: load_global_v3s32 ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX9: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = GLOBAL_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1) 
+ ; GFX9: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = GLOBAL_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1) ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[GLOBAL_LOAD_DWORDX3_]] ; GFX10-LABEL: name: load_global_v3s32 ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX10: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = GLOBAL_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1) + ; GFX10: [[GLOBAL_LOAD_DWORDX3_:%[0-9]+]]:vreg_96 = GLOBAL_LOAD_DWORDX3 [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 12, align 4, addrspace 1) ; GFX10: $vgpr0_vgpr1_vgpr2 = COPY [[GLOBAL_LOAD_DWORDX3_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<3 x s32>) = G_LOAD %0 :: (load 12, align 4, addrspace 1) @@ -134,4 +135,3 @@ $vgpr0_vgpr1_vgpr2 = COPY %1 ... - Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir @@ -19,12 +19,12 @@ ; GFX6-LABEL: name: load_private_s32_from_4 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: 
[[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -49,12 +49,12 @@ ; GFX6-LABEL: name: load_private_s32_from_2 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) + ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_2 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) + ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 5) @@ -79,12 +79,12 @@ ; GFX6-LABEL: name: load_private_s32_from_1 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY 
[[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -109,12 +109,12 @@ ; GFX6-LABEL: name: load_private_p3_from_4 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_p3_from_4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -139,12 +139,12 @@ ; GFX6-LABEL: name: load_private_p5_from_4 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN 
[[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_p5_from_4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -170,12 +170,12 @@ ; GFX6-LABEL: name: load_private_v2s16 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_v2s16 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) 
= COPY $vgpr0 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -206,12 +206,12 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2047 @@ -240,14 +240,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY 
[[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047_known_bits ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2147483647 @@ -279,12 +279,12 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2048 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; 
GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2048 @@ -313,14 +313,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2047 @@ -349,14 +349,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; 
GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2048 @@ -385,12 +385,12 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_4095 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: 
[[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4095 @@ -419,14 +419,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_4096 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4096 @@ -455,14 +455,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; 
GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4095 @@ -491,14 +491,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4096 @@ -527,14 +527,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_8191 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, 
addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8191 @@ -563,14 +563,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_8192 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8192 @@ -599,14 +599,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8191, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, 
addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8191, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -8191 @@ -635,14 +635,14 @@ ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8192, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -8192, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit 
$exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -8192 @@ -666,10 +666,10 @@ bb.0: ; GFX6-LABEL: name: load_private_s32_from_4_constant_0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_4_constant_0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] %0:vgpr(p5) = G_CONSTANT i32 0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -691,10 +691,10 @@ bb.0: ; GFX6-LABEL: name: load_private_s32_from_4_constant_sgpr_16 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; 
GFX9-LABEL: name: load_private_s32_from_4_constant_sgpr_16 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] %0:sgpr(p5) = G_CONSTANT i32 16 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -716,10 +716,10 @@ bb.0: ; GFX6-LABEL: name: load_private_s32_from_1_constant_4095 - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_1_constant_4095 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]] %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -742,11 +742,11 @@ ; GFX6-LABEL: name: load_private_s32_from_1_constant_4096 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -770,10 +770,10 @@ bb.0: ; GFX6-LABEL: name: load_private_s32_from_fi - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_fi - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -797,10 +797,10 @@ bb.0: ; GFX6-LABEL: name: load_private_s32_from_1_fi_offset_4095 - ; GFX6: 
[[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 @@ -829,13 +829,13 @@ ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; 
GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4096 @@ -861,11 +861,11 @@ ; GFX6-LABEL: name: load_private_s32_from_neg1 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_neg1 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = G_CONSTANT i32 -1 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir @@ -18,16 +18,16 @@ ; 
WAVE64: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; WAVE64: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec ; WAVE64: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) - ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; WAVE64: FLAT_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; WAVE32-LABEL: name: sitofp ; WAVE32: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; WAVE32: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; WAVE32: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; WAVE32: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY]], 0, 0, implicit $mode, implicit $exec ; WAVE32: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) - ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; WAVE32: GLOBAL_STORE_DWORD [[COPY2]], [[V_CVT_F32_I32_e64_1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s32) = COPY $vgpr0 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir @@ -17,12 +17,12 @@ ; GFX7: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; GFX7: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 4) + ; GFX7: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 4) ; GFX9-LABEL: name: atomic_store_flat_s32_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; GFX9: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 4) + ; GFX9: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 4) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p0) = COPY $vgpr1_vgpr2 G_STORE %0, %1 :: (store seq_cst 4, align 4, addrspace 0) @@ -152,12 +152,12 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 8) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 8) ; GFX9-LABEL: name: atomic_store_flat_s64_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 8) + ; GFX9: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst 8) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p0) = COPY $vgpr2_vgpr3 
G_STORE %0, %1 :: (store seq_cst 8, align 8, addrspace 0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir @@ -19,22 +19,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX8-LABEL: name: store_flat_s32_to_4 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX9-LABEL: name: store_flat_s32_to_4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX10-LABEL: name: store_flat_s32_to_4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store 4, 
align 4, addrspace 0) @@ -55,22 +55,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2) + ; GFX7: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2) ; GFX8-LABEL: name: store_flat_s32_to_2 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2) + ; GFX8: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2) ; GFX9-LABEL: name: store_flat_s32_to_2 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2) + ; GFX9: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2) ; GFX10-LABEL: name: store_flat_s32_to_2 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2) + ; GFX10: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store 2, align 2, addrspace 0) @@ -91,22 +91,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1) + ; GFX7: 
FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1) ; GFX8-LABEL: name: store_flat_s32_to_1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1) + ; GFX8: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1) ; GFX9-LABEL: name: store_flat_s32_to_1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1) + ; GFX9: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1) ; GFX10-LABEL: name: store_flat_s32_to_1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1) + ; GFX10: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store 1, align 1, addrspace 0) @@ -128,22 +128,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX8-LABEL: name: store_flat_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: 
FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX9-LABEL: name: store_flat_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX10-LABEL: name: store_flat_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store 8, align 8, addrspace 0) @@ -237,22 +237,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX8-LABEL: name: store_flat_v2s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX9-LABEL: name: store_flat_v2s32 ; 
GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX10-LABEL: name: store_flat_v2s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store 8, align 8, addrspace 0) @@ -273,22 +273,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 - ; GFX7: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16) + ; GFX7: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16) ; GFX8-LABEL: name: store_flat_v3s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 - ; GFX8: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16) + ; GFX8: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16) ; GFX9-LABEL: name: store_flat_v3s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 - ; GFX9: 
FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16) + ; GFX9: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16) ; GFX10-LABEL: name: store_flat_v3s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 - ; GFX10: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16) + ; GFX10: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4 G_STORE %1, %0 :: (store 12, align 16, addrspace 0) @@ -309,22 +309,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) + ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) ; GFX8-LABEL: name: store_flat_v4s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) + ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) ; GFX9-LABEL: name: store_flat_v4s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX9: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) 
+ ; GFX9: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) ; GFX10-LABEL: name: store_flat_v4s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) + ; GFX10: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s32>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store 16, align 16, addrspace 0) @@ -346,22 +346,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX8-LABEL: name: store_flat_v2s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX9-LABEL: name: store_flat_v2s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX10-LABEL: name: store_flat_v2s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; 
GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = COPY $vgpr2 G_STORE %1, %0 :: (store 4, align 4, addrspace 0) @@ -383,22 +383,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX8-LABEL: name: store_flat_v4s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX9-LABEL: name: store_flat_v4s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX10-LABEL: name: store_flat_v4s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit 
$flat_scr :: (store 8) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store 8, align 8, addrspace 0) @@ -493,22 +493,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) + ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) ; GFX8-LABEL: name: store_flat_v2s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) + ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) ; GFX9-LABEL: name: store_flat_v2s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX9: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) + ; GFX9: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) ; GFX10-LABEL: name: store_flat_v2s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) + ; GFX10: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store 16, 
align 16, addrspace 0) @@ -530,22 +530,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX8-LABEL: name: store_flat_p1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX9-LABEL: name: store_flat_p1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) ; GFX10-LABEL: name: store_flat_p1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) + ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store 8, align 8, addrspace 0) @@ -604,22 +604,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, 
implicit $flat_scr :: (store 4) + ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX8-LABEL: name: store_flat_p3 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX9-LABEL: name: store_flat_p3 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX10-LABEL: name: store_flat_p3 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %1, %0 :: (store 4, align 4, addrspace 0) @@ -677,22 +677,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4) + ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4) ; GFX8-LABEL: name: store_atomic_flat_s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 
= COPY $vgpr2 - ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4) + ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4) ; GFX9-LABEL: name: store_atomic_flat_s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4) + ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4) ; GFX10-LABEL: name: store_atomic_flat_s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4) + ; GFX10: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store monotonic 4, align 4, addrspace 0) @@ -714,22 +714,22 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8) ; GFX8-LABEL: name: store_atomic_flat_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 
0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8) ; GFX9-LABEL: name: store_atomic_flat_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8) + ; GFX9: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8) ; GFX10-LABEL: name: store_atomic_flat_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8) + ; GFX10: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store monotonic 8, align 8, addrspace 0) @@ -761,7 +761,7 @@ ; GFX7: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX7: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX8-LABEL: name: store_flat_s32_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -776,12 +776,12 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 
[[COPY2]], [[COPY3]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX8: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX9-LABEL: name: store_flat_s32_gep_2047 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX9: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) ; GFX10-LABEL: name: store_flat_s32_gep_2047 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -796,7 +796,7 @@ ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX10: %9:vgpr_32, dead %11:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX10: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX10: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + ; GFX10: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir @@ -26,7 +26,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GFX7-LABEL: name: store_global_s32_to_4 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -36,27 +36,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_s32_to_4 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX8-LABEL: name: store_global_s32_to_4 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; 
GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX9-LABEL: name: store_global_s32_to_4 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GFX10-LABEL: name: store_global_s32_to_4 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store 4, align 4, addrspace 1) @@ -82,7 +82,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1) + ; GFX6: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1) ; GFX7-LABEL: name: store_global_s32_to_2 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -92,27 +92,27 @@ ; GFX7: 
[[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1) + ; GFX7: BUFFER_STORE_SHORT_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_s32_to_2 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-FLAT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1) ; GFX8-LABEL: name: store_global_s32_to_2 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1) + ; GFX8: FLAT_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2, addrspace 1) ; GFX9-LABEL: name: store_global_s32_to_2 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1) + ; GFX9: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1) ; GFX10-LABEL: name: store_global_s32_to_2 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1) + ; GFX10: GLOBAL_STORE_SHORT [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store 2, align 2, addrspace 1) @@ -138,7 +138,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1) + ; GFX6: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1) ; GFX7-LABEL: name: store_global_s32_to_1 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -148,27 +148,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1) + ; GFX7: BUFFER_STORE_BYTE_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_s32_to_1 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-FLAT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, 
addrspace 1) + ; GFX7-FLAT: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, addrspace 1) ; GFX8-LABEL: name: store_global_s32_to_1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, addrspace 1) + ; GFX8: FLAT_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 1, addrspace 1) ; GFX9-LABEL: name: store_global_s32_to_1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1) + ; GFX9: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1) ; GFX10-LABEL: name: store_global_s32_to_1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1) + ; GFX10: GLOBAL_STORE_BYTE [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store 1, align 1, addrspace 1) @@ -195,27 +195,27 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX8-LABEL: name: store_global_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX9-LABEL: name: store_global_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; GFX10-LABEL: name: store_global_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store 8, align 8, addrspace 1) @@ -288,7 +288,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], 
%subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX6: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; GFX7-LABEL: name: store_global_v2s32 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -298,27 +298,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX7: BUFFER_STORE_DWORDX2_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_v2s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX8-LABEL: name: store_global_v2s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX9-LABEL: 
name: store_global_v2s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; GFX10-LABEL: name: store_global_v2s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store 8, align 8, addrspace 1) @@ -344,7 +344,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; GFX6: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; GFX7-LABEL: name: store_global_v4s32 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -354,27 +354,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: 
BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; GFX7: BUFFER_STORE_DWORDX4_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_v4s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) ; GFX8-LABEL: name: store_global_v4s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) + ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) ; GFX9-LABEL: name: store_global_v4s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX9: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; GFX10-LABEL: name: store_global_v4s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; 
GFX10: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s32>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store 16, align 16, addrspace 1) @@ -401,27 +401,27 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_v2s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX8-LABEL: name: store_global_v2s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX9-LABEL: name: store_global_v2s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GFX10-LABEL: name: store_global_v2s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: 
[[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s16>) = COPY $vgpr2 G_STORE %1, %0 :: (store 4, align 4, addrspace 1) @@ -448,27 +448,27 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_v4s16 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX8-LABEL: name: store_global_v4s16 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX9-LABEL: name: store_global_v4s16 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 
0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; GFX10-LABEL: name: store_global_v4s16 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store 8, align 8, addrspace 1) @@ -542,27 +542,27 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) + ; GFX7: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_v2s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) ; GFX8-LABEL: name: store_global_v2s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX8: FLAT_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) + ; GFX8: FLAT_STORE_DWORDX4 
[[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16, addrspace 1) ; GFX9-LABEL: name: store_global_v2s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX9: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; GFX10-LABEL: name: store_global_v2s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s64>) = COPY $vgpr2_vgpr3_vgpr4_vgpr5 G_STORE %1, %0 :: (store 16, align 16, addrspace 1) @@ -589,27 +589,27 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_p1 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; 
GFX8-LABEL: name: store_global_p1 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8, addrspace 1) ; GFX9-LABEL: name: store_global_p1 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; GFX10-LABEL: name: store_global_p1 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p1) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store 8, align 8, addrspace 1) @@ -683,27 +683,27 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_p3 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], 
[[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX8-LABEL: name: store_global_p3 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX9-LABEL: name: store_global_p3 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GFX10-LABEL: name: store_global_p3 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %1, %0 :: (store 4, align 4, addrspace 1) @@ -776,27 +776,27 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1) + ; GFX7: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1) ; GFX7-FLAT-LABEL: name: store_atomic_global_s32 ; 
GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1) ; GFX8-LABEL: name: store_atomic_global_s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1) + ; GFX8: FLAT_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 4, addrspace 1) ; GFX9-LABEL: name: store_atomic_global_s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store monotonic 4, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store monotonic 4, addrspace 1) ; GFX10-LABEL: name: store_atomic_global_s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store monotonic 4, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store monotonic 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 G_STORE %1, %0 :: (store monotonic 4, align 4, addrspace 1) @@ -823,27 +823,27 @@ ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - 
; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1) + ; GFX7: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1) ; GFX7-FLAT-LABEL: name: store_atomic_global_s64 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1) ; GFX8-LABEL: name: store_atomic_global_s64 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1) + ; GFX8: FLAT_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store monotonic 8, addrspace 1) ; GFX9-LABEL: name: store_atomic_global_s64 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store monotonic 8, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store monotonic 8, addrspace 1) ; GFX10-LABEL: name: store_atomic_global_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 - ; GFX10: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store monotonic 8, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORDX2 
[[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store monotonic 8, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s64) = COPY $vgpr2_vgpr3 G_STORE %1, %0 :: (store monotonic 8, align 8, addrspace 1) @@ -870,7 +870,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX6: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX6: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GFX7-LABEL: name: store_global_s32_gep_2047 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -880,7 +880,7 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX7: BUFFER_STORE_DWORD_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 2047, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_s32_gep_2047 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -895,7 +895,7 @@ ; GFX7-FLAT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX7-FLAT: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed 
[[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX7-FLAT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX7-FLAT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX8-LABEL: name: store_global_s32_gep_2047 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -910,17 +910,17 @@ ; GFX8: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY2]], [[COPY3]], 0, implicit $exec ; GFX8: %9:vgpr_32, dead %11:sreg_64_xexec = V_ADDC_U32_e64 [[COPY4]], [[COPY5]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %9, %subreg.sub1 - ; GFX8: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; GFX8: FLAT_STORE_DWORD [[REG_SEQUENCE1]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; GFX9-LABEL: name: store_global_s32_gep_2047 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GFX10-LABEL: name: store_global_s32_gep_2047 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORD [[COPY]], [[COPY1]], 2047, 
0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(s32) = COPY $vgpr2 %2:vgpr(s64) = G_CONSTANT i64 2047 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir @@ -26,27 +26,27 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 ; GFX7: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3 - ; GFX7: BUFFER_STORE_DWORDX3_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1) + ; GFX7: BUFFER_STORE_DWORDX3_ADDR64 [[COPY1]], [[COPY]], [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1) ; GFX7-FLAT-LABEL: name: store_global_v3s32 ; GFX7-FLAT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX7-FLAT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-FLAT: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 - ; GFX7-FLAT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16, addrspace 1) + ; GFX7-FLAT: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16, addrspace 1) ; GFX8-LABEL: name: store_global_v3s32 ; GFX8: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX8: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 - ; GFX8: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12, align 16, addrspace 1) + ; GFX8: FLAT_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr 
:: (store 12, align 16, addrspace 1) ; GFX9-LABEL: name: store_global_v3s32 ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 - ; GFX9: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1) + ; GFX9: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1) ; GFX10-LABEL: name: store_global_v3s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10: [[COPY1:%[0-9]+]]:vreg_96 = COPY $vgpr2_vgpr3_vgpr4 - ; GFX10: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1) + ; GFX10: GLOBAL_STORE_DWORDX3 [[COPY]], [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (store 12, align 16, addrspace 1) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<3 x s32>) = COPY $vgpr2_vgpr3_vgpr4 G_STORE %1, %0 :: (store 12, align 16, addrspace 1) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir @@ -21,12 +21,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: function_store_private_s32_to_4 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], 
$sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -52,12 +52,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) ; GFX9-LABEL: name: function_store_private_s32_to_2 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 2, align 2, addrspace 5) @@ -83,12 +83,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: function_store_private_s32_to_1 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_BYTE_OFFEN 
[[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 1, align 1, addrspace 5) @@ -114,12 +114,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: function_store_private_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -145,12 +145,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: function_store_private_p3 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: 
BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -176,12 +176,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: function_store_private_p5 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -206,10 +206,10 @@ ; GFX6-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, 
implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -236,10 +236,10 @@ ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) @@ -265,11 +265,11 @@ ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4096 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(store 1, addrspace 5) ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) @@ -294,12 +294,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: kernel_store_private_s32_to_4 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -324,12 +324,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], 
$sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) ; GFX9-LABEL: name: kernel_store_private_s32_to_2 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 2, align 2, addrspace 5) @@ -354,12 +354,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: kernel_store_private_s32_to_1 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 1, align 1, addrspace 5) @@ -384,12 +384,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: kernel_store_private_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -414,12 +414,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: kernel_store_private_p3 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 
:: (store 4, align 4, addrspace 5) @@ -444,12 +444,12 @@ ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: kernel_store_private_p5 ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -475,11 +475,11 @@ ; GFX6-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095 ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095 ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN 
[[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 %2:vgpr(p5) = G_PTR_ADD %0, %1 @@ -507,11 +507,11 @@ ; GFX6-LABEL: name: kernel_store_private_s32_to_1_constant_4095 ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4095 ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) @@ -538,12 +538,12 @@ ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4096 ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll @@ -1,10 +1,21 @@ +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s ; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 -declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) #0 +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) ; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn: +; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc define float @global_atomic_fadd_f32_rtn(float addrspace(1)* %ptr, float %data) { %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret float %ret } + +; GFX90A-LABEL: {{^}}global_atomic_fadd_v2f16_rtn: +; GFX90A: global_atomic_pk_add_f16 
v0, v[0:1], v2, off glc +define <2 x half> @global_atomic_fadd_v2f16_rtn(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) + ret <2 x half> %ret +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s define void @global_atomic_fadd_f32(float addrspace(1)* %ptr, float %data) { ; GFX908-LABEL: global_atomic_fadd_f32: @@ -8,6 +9,13 @@ ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_f32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void } @@ -19,6 +27,13 @@ ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_f32_off_2048: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] %gep = getelementptr float, float addrspace(1)* %ptr, i64 512 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data) ret void @@ -31,6 +46,13 @@ ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_f32_off_neg2047: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(1)* %ptr, i64 -511 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data) ret void @@ -47,6 +69,16 @@ ; GFX908-NEXT: v_mov_b32_e32 v1, s1 ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2048 ; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: global_atomic_fadd_f32_off_ss: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc +; GFX90A-NEXT: s_endpgm %gep = getelementptr float, float addrspace(1)* %ptr, i64 512 %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data) ret void @@ -59,6 +91,13 @@ ; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_v2f16: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x 
half> addrspace(1)* %ptr, <2 x half> %data) ret void } @@ -70,6 +109,13 @@ ; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2044 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: global_atomic_fadd_v2f16_off_neg2047: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2044 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511 %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data) ret void Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd-with-ret.ll @@ -0,0 +1,25 @@ +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s +; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 + +declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32 immarg) + +; GFX908: error: {{.*}} return versions of fp atomics not supported + +; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: +; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc +define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 %soffset) { +main_body: + %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + store float %ret, float* undef + ret void +} + +; 
GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn: +; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}} offen glc +define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +main_body: + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + store <2 x half> %ret, <2 x half>* undef + ret void +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll @@ -1,185 +1,329 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX908 +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A ; Natural mapping define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 
= COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: 
[[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; 
GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) ret void } define amdgpu_ps void 
@raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: 
raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_4095__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void } ; Natural mapping, no voffset define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], 
[[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call 
float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } ; All operands need regbank legalization define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec - ; CHECK: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec - ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: successors: 
%bb.2(0x80000000) + ; GFX908: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX908: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX908: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX908: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX908: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX908: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX908: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GFX908: bb.2: + ; GFX908: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX908: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; GFX908: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX908: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; GFX908: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; GFX908: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], 
implicit $exec + ; GFX908: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GFX908: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX908: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX908: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; GFX908: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; GFX908: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX908: bb.3: + ; GFX908: successors: %bb.4(0x80000000) + ; GFX908: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GFX908: bb.4: + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: successors: %bb.2(0x80000000) + ; GFX90A: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX90A: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY 
$vgpr4 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX90A: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GFX90A: bb.2: + ; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX90A: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; GFX90A: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; GFX90A: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; GFX90A: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GFX90A: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX90A: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], 
implicit $exec + ; GFX90A: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; GFX90A: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX90A: bb.3: + ; GFX90A: successors: %bb.4(0x80000000) + ; GFX90A: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GFX90A: bb.4: + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; All operands need regbank legalization, no voffset define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], 
%subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; CHECK: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec - ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: successors: %bb.2(0x80000000) + ; GFX908: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX908: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX908: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX908: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX908: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GFX908: bb.2: + ; GFX908: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX908: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; 
GFX908: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX908: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; GFX908: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; GFX908: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; GFX908: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GFX908: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX908: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX908: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec + ; GFX908: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; GFX908: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX908: bb.3: + ; GFX908: 
successors: %bb.4(0x80000000) + ; GFX908: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GFX908: bb.4: + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: successors: %bb.2(0x80000000) + ; GFX90A: liveins: $sgpr2, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX90A: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX90A: [[COPY7:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX90A: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GFX90A: bb.2: + ; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX90A: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub0, implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]].sub1, implicit $exec + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY7]], implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]].sub1, implicit $exec + ; GFX90A: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], 
%subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; GFX90A: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec + ; GFX90A: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GFX90A: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX90A: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX90A: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec + ; GFX90A: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX90A: bb.3: + ; GFX90A: successors: %bb.4(0x80000000) + ; GFX90A: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GFX90A: bb.4: + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(float %val, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { - ; CHECK-LABEL: name: 
raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: 
raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4095 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -187,54 +331,92 @@ ; Natural mapping + slc define amdgpu_ps void @raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; 
CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: raw_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], 
%subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void } define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 
= COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: 
raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: raw_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = 
COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) ret void } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll @@ -14,7 +14,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY 
[[BUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: raw_buffer_load_format_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset @@ -27,7 +27,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.buffer.load.format.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -45,7 +45,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = 
COPY [[BUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset @@ -58,7 +58,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0 ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1 ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 @@ -93,7 +93,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", 
align 1, addrspace 4) ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; PACKED: $vgpr0 = COPY [[COPY6]] @@ -109,7 +109,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0 ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2 @@ -169,7 +169,7 @@ ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: 
[[BUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -209,7 +209,7 @@ ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -234,7 +234,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, 
[[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4) ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; PACKED: $vgpr0 = COPY [[COPY6]] @@ -250,7 +250,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource" + 4095, align 1, addrspace 4) ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0 ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll @@ -13,7 +13,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -31,7 +31,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY 
[[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; CHECK: $vgpr0 = COPY [[COPY6]] @@ -52,7 +52,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -75,7 +75,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY 
[[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -121,7 +121,7 @@ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -146,7 +146,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], 
[[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll @@ -14,7 +14,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -34,7 +34,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr7 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY6]], 
[[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -69,7 +69,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -114,7 +114,7 @@ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -140,7 +140,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) @@ -159,7 +159,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: 
[[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) @@ -178,7 +178,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) @@ -197,7 +197,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) @@ -216,7 +216,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) @@ -235,7 +235,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: 
[[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) @@ -254,7 +254,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; CHECK: $vgpr0 = COPY [[COPY6]] @@ -275,7 +275,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 
0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORDX3_OFFEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub1 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFEN]].sub2 @@ -298,7 +298,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -323,7 +323,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -341,7 +341,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -365,7 +365,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: 
[[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; CHECK: $vgpr0 = COPY [[COPY6]] @@ -386,7 +386,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4) + ; CHECK: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -406,7 +406,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4) + ; CHECK: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 1 from custom "BufferResource", addrspace 4) ; CHECK: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_OFFEN]], 0, 8, implicit $exec ; CHECK: $vgpr0 = COPY [[V_BFE_I32_e64_]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -444,7 +444,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -486,7 +486,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: 
[[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4) + ; CHECK: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -512,7 +512,7 @@ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -529,7 +529,7 @@ ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET 
[[REG_SEQUENCE]], [[COPY4]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[COPY4]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) @@ -548,7 +548,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4096, align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE]], [[COPY4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4096, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) @@ -566,7 +566,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 4 from custom "BufferResource" + 16, align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 16, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 16 @@ -585,7 +585,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 @@ -607,7 +607,7 @@ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %10:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4096, align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable 
load 4 from custom "BufferResource" + 4096, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4096 @@ -626,7 +626,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) @@ -644,7 +644,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG 
implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) @@ -664,7 +664,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 16 @@ -685,7 +685,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 
@@ -706,7 +706,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -744,7 +744,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -790,7 +790,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %13, [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 5000, align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %13, [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -14,7 +14,7 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, 
[[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -44,7 +44,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 
4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -56,7 +56,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -78,7 +78,7 @@ ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) 
; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -91,7 +91,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,7 +116,7 @@ ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 
(%ir-block.0): @@ -131,7 +131,7 @@ ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -173,7 +173,7 @@ ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 
killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -211,7 +211,7 @@ ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -240,7 +240,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; 
UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -253,7 +253,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -275,7 +275,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +288,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -310,7 +310,7 @@ ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: 
raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -323,7 +323,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -346,7 +346,7 @@ ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ 
-359,7 +359,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -385,7 +385,7 @@ ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -401,7 +401,7 @@ ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; 
PACKED: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -448,7 +448,7 @@ ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -489,7 +489,7 @@ 
; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY10]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -14,7 +14,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, 
addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -31,7 +31,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -51,7 +51,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call 
void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -72,7 +72,7 @@ ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -94,7 +94,7 @@ ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x 
i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -132,7 +132,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -159,7 +159,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable 
store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -179,7 +179,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -199,7 +199,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void 
@llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -220,7 +220,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -244,7 +244,7 @@ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -288,7 +288,7 @@ ; CHECK: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 4096, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -15,7 +15,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact 
[[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -36,7 +36,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -71,7 +71,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom 
"BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -103,7 +103,7 @@ ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -148,7 +148,7 @@ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact 
[[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -173,7 +173,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -191,7 +191,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: 
S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -209,7 +209,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -227,7 +227,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -245,7 +245,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], 
%subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -263,7 +263,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -281,7 +281,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], 
[[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -301,7 +301,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -322,7 +322,7 @@ ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) ; CHECK: 
S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -344,7 +344,7 @@ ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -362,7 +362,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -381,7 +381,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY 
$vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -400,7 +400,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -418,7 +418,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 
0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -438,7 +438,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -474,7 +474,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -498,7 +498,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -517,7 +517,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 
0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -535,7 +535,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -554,7 +554,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, 
i32 0) @@ -576,7 +576,7 @@ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -595,7 +595,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -613,7 +613,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: 
BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -631,7 +631,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -650,7 +650,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], 
[[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -672,7 +672,7 @@ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %15:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -711,7 +711,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %13, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %13, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -755,7 +755,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY6]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll @@ -13,7 +13,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset @@ -26,7 +26,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -44,7 +44,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = 
COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub0 ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_OFFEN]].sub1 ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 @@ -68,7 +68,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_OFFEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.raw.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -92,7 +92,7 @@ ; 
UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub0 ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub1 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_OFFEN]].sub2 @@ -126,7 +126,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub0 ; PACKED: 
[[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_OFFEN]].sub1 ; PACKED: $vgpr0 = COPY [[COPY6]] @@ -167,7 +167,7 @@ ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -207,7 +207,7 @@ ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: 
[[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -232,7 +232,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc @@ -245,7 +245,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 
0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ -263,7 +263,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc @@ -276,7 +276,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], 
[[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -294,7 +294,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc @@ -307,7 +307,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = 
TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -325,7 +325,7 @@ ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_OFFEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: raw_tbuffer_load_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc @@ -338,7 +338,7 @@ ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: 
[[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_OFFEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.raw.tbuffer.load.f16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll @@ -12,7 +12,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -30,7 +30,7 @@ ; 
CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_XY_OFFEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFEN]].sub1 ; CHECK: $vgpr0 = COPY [[COPY6]] @@ -51,7 +51,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub1 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFEN]].sub2 @@ -74,7 +74,7 @@ ; CHECK: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub0 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub1 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFEN]].sub2 @@ -119,7 +119,7 @@ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY5]], implicit $exec ; CHECK: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc - ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY6]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed 
[[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -144,7 +144,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) @@ -162,7 +162,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY 
[[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) @@ -180,7 +180,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) @@ -198,7 +198,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_X_OFFEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_OFFEN]] ; CHECK: 
SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.tbuffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll @@ -14,7 +14,7 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom 
"BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -49,7 +49,7 @@ ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_v2f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -62,7 +62,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -93,7 +93,7 @@ ; 
UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY1]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_v4f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -108,7 +108,7 @@ ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -143,7 +143,7 @@ ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec 
= V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -179,7 +179,7 @@ ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -224,7 +224,7 @@ ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -263,7 +263,7 @@ ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], 
[[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -309,7 +309,7 @@ ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -349,7 +349,7 @@ ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -374,7 +374,7 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_glc ; PACKED: bb.1 (%ir-block.0): @@ -387,7 +387,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; 
PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 1) ret void @@ -405,7 +405,7 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc ; PACKED: bb.1 (%ir-block.0): @@ -418,7 +418,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 
0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -436,7 +436,7 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc_glc ; PACKED: bb.1 (%ir-block.0): @@ -449,7 +449,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -467,7 +467,7 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_D16_X_gfx80_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_f16__sgpr_rsrc__vgpr_voffset__sgpr_soffset_dlc ; PACKED: bb.1 (%ir-block.0): @@ -480,7 +480,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void Index: 
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll @@ -14,7 +14,7 @@ ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_tbuffer_store_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.i8(i8 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -62,7 +62,7 @@ ; UNPACKED: 
[[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -98,7 +98,7 @@ ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 1 into custom "BufferResource", addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -143,7 +143,7 @@ ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -182,7 +182,7 @@ ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -228,7 +228,7 @@ ; UNPACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; UNPACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -268,7 +268,7 @@ ; PACKED: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = 
S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; PACKED: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll @@ -14,7 +14,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -35,7 +35,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY 
$sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE]], [[COPY6]], [[REG_SEQUENCE1]], [[COPY7]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -57,7 +57,7 @@ ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE]], [[COPY7]], [[REG_SEQUENCE1]], [[COPY8]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -80,7 +80,7 @@ ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], 
%subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE]], [[COPY8]], [[REG_SEQUENCE1]], [[COPY9]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 0) ret void @@ -100,7 +100,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr7 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) ret void @@ -135,7 +135,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 94, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -180,7 +180,7 @@ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK: 
S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -226,7 +226,7 @@ ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U32_e64_]], [[S_AND_B32_]], implicit-def $scc - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY7]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -252,7 +252,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 
78, i32 1) ret void @@ -271,7 +271,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 2) ret void @@ -290,7 +290,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 1, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 3) ret void @@ -309,7 +309,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; 
CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 78, i32 4) ret void @@ -328,7 +328,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 94, i32 0) ret void @@ -345,7 +345,7 @@ ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 94, 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 94, i32 0) ret void @@ -364,7 +364,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY6]], [[REG_SEQUENCE]], [[COPY5]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 94, i32 0) ret void @@ -382,7 +382,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset = add i32 %voffset.base, 16 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 
94, i32 0) @@ -401,7 +401,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4095 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -423,7 +423,7 @@ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE]], [[COPY6]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset = add i32 %voffset.base, 4096 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -442,7 +442,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], 
%subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 94, i32 0) ret void @@ -460,7 +460,7 @@ ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 94, i32 0) ret void @@ -480,7 +480,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %soffset = add i32 %soffset.base, 16 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -501,7 +501,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %soffset = add i32 %soffset.base, 4095 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -522,7 +522,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY6]], [[S_MOV_B32_]], implicit-def $scc - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: 
TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %soffset = add i32 %soffset.base, 4096 call void @llvm.amdgcn.raw.tbuffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 94, i32 0) @@ -560,7 +560,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[COPY5]], [[REG_SEQUENCE3]], [[S_ADD_I32_]], 0, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -606,7 +606,7 @@ ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE3]], [[COPY6]], 904, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4) + ; CHECK: TBUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY]], [[V_ADD_U32_e64_]], [[REG_SEQUENCE3]], [[COPY6]], 904, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -1491,7 +1491,7 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset @@ -1504,7 +1504,7 @@ 
; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset @@ -1517,7 +1517,7 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %soffset, i32 0) @@ -1535,7 +1535,7 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = 
BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4) ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX6: $vgpr0 = COPY [[COPY5]] @@ -1551,7 +1551,7 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4) ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX7: $vgpr0 = COPY [[COPY5]] @@ -1567,7 +1567,7 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX2_OFFEN:%[0-9]+]]:vreg_64 = 
BUFFER_LOAD_DWORDX2_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 8, align 4) ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub0 ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFEN]].sub1 ; GFX8: $vgpr0 = COPY [[COPY5]] @@ -1588,7 +1588,7 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX6: [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]] ; GFX6: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] @@ -1611,7 +1611,7 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX7: [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]] ; GFX7: 
[[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] @@ -1634,7 +1634,7 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; GFX8: [[COPY5:%[0-9]+]]:vreg_128 = COPY [[DEF]] ; GFX8: [[COPY6:%[0-9]+]]:vreg_128 = COPY [[DEF]] @@ -1662,7 +1662,7 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX6: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -1682,7 +1682,7 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, 
[[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX7: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -1702,7 +1702,7 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GFX8: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 @@ -1727,8 +1727,8 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: 
[[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1757,8 +1757,8 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1787,8 +1787,8 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = 
COPY [[REG_SEQUENCE1]].sub1 @@ -1822,10 +1822,10 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable invariant load 16 + 48, align 4) ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1870,10 +1870,10 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1918,10 +1918,10 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -1971,7 +1971,7 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 @@ -1984,7 +1984,7 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4092 @@ -1997,7 +1997,7 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4092 @@ -2016,7 +2016,7 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 @@ -2029,7 +2029,7 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4095 @@ -2042,7 +2042,7 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant 
load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4095 @@ -2061,7 +2061,7 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_vgpr_offset_add_4096 @@ -2074,7 +2074,7 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: 
s_buffer_load_f32_vgpr_offset_add_4096 @@ -2087,7 +2087,7 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %soffset = add i32 %soffset.base, 4096 @@ -2107,8 +2107,8 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; 
GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2137,8 +2137,8 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2167,8 +2167,8 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: 
[[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2204,8 +2204,8 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = 
BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2234,8 +2234,8 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4068 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX7: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2264,8 +2264,8 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_256 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7 ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2300,10 +2300,10 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6: 
[[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2348,10 +2348,10 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 
0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2396,10 +2396,10 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], 
[[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) ; GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2450,10 +2450,10 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = 
BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) ; GFX6: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2498,10 +2498,10 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4036 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: 
[[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) ; GFX7: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2546,10 +2546,10 @@ ; GFX8: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4032, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4048, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 48, align 4) ; 
GFX8: [[REG_SEQUENCE1:%[0-9]+]]:vreg_512 = REG_SEQUENCE [[BUFFER_LOAD_DWORDX4_OFFEN]], %subreg.sub0_sub1_sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]], %subreg.sub4_sub5_sub6_sub7, [[BUFFER_LOAD_DWORDX4_OFFEN2]], %subreg.sub8_sub9_sub10_sub11, [[BUFFER_LOAD_DWORDX4_OFFEN3]], %subreg.sub12_sub13_sub14_sub15 ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 @@ -2617,7 +2617,7 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -2652,7 +2652,7 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -2687,7 +2687,7 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -2726,7 +2726,7 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -2759,7 +2759,7 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -2792,7 +2792,7 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4092, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -2836,7 +2836,7 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable 
invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -2873,7 +2873,7 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -2910,7 +2910,7 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -2950,7 +2950,7 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ 
%bb.2, implicit $exec @@ -2983,7 +2983,7 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3016,7 +3016,7 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], 
[[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4095, align 1) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3057,7 +3057,7 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3092,7 +3092,7 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, 
[[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3125,7 +3125,7 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4096) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4 + 4096) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3165,8 +3165,8 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], 
[[COPY6]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3215,8 +3215,8 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 
= BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3265,8 +3265,8 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = 
BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[COPY4]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3327,8 +3327,8 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3381,8 +3381,8 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3435,8 +3435,8 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 
[[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3495,8 +3495,8 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], 
%subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3549,8 +3549,8 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3603,8 +3603,8 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN 
[[COPY5]], [[REG_SEQUENCE3]], [[S_MOV_B32_1]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3660,8 +3660,8 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: 
S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3711,8 +3711,8 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3762,8 +3762,8 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 936, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 952, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 936, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 952, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3819,8 +3819,8 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) 
- ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3870,8 +3870,8 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec 
:: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3921,8 +3921,8 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], 
implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -3978,8 +3978,8 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -4029,8 +4029,8 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 
[[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -4080,8 +4080,8 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY6]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], 
[[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY4]], [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16, align 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -4136,8 +4136,8 @@ ; GFX6: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec ; GFX6: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX6: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) - ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) + ; GFX6: 
[[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) + ; GFX6: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) ; GFX6: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX6: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX6: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -4186,8 +4186,8 @@ ; GFX7: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec ; GFX7: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX7: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) - ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) + ; GFX7: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable invariant load 16 + 4064, align 4) ; GFX7: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX7: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX7: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -4236,8 +4236,8 @@ ; GFX8: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY5]], implicit $exec ; GFX8: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; GFX8: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) - ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4064, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) + ; GFX8: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE3]], [[S_MOV_B32_]], 4080, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16 + 4064, align 4) ; GFX8: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; GFX8: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -4277,7 +4277,7 @@ ; GFX6: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr @@ -4290,7 +4290,7 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr @@ -4303,7 +4303,7 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.v, %offset.s @@ -4322,7 +4322,7 @@ ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr @@ -4335,7 +4335,7 @@ ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: 
s_buffer_load_f32_offset_add_sgpr_vgpr @@ -4348,7 +4348,7 @@ ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset = add i32 %offset.s, %offset.v @@ -4370,7 +4370,7 @@ ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX6: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm @@ -4386,7 +4386,7 @@ ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX7: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_vgpr_sgpr_imm @@ -4402,7 +4402,7 @@ ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, %offset.s @@ -4425,7 +4425,7 @@ ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX6: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm @@ -4441,7 +4441,7 @@ ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX7: 
%9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_sgpr_vgpr_imm @@ -4457,7 +4457,7 @@ ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8: %9:vgpr_32, dead %17:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %9, [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, %offset.v @@ -4480,7 +4480,7 @@ ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX6: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: 
[[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr @@ -4495,7 +4495,7 @@ ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX7: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr @@ -4510,7 +4510,7 @@ ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX8: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY 
[[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, 1024 @@ -4533,7 +4533,7 @@ ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX6: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX6: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr @@ -4549,7 +4549,7 @@ ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX7: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX7: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX7: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX7: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr @@ -4565,7 +4565,7 @@ ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 ; GFX8: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GFX8: %10:vgpr_32, dead %16:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 
0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + ; GFX8: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %10, [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) ; GFX8: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, 1024 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd-with-ret.ll @@ -1,11 +1,25 @@ +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s ; RUN: not --crash llc -global-isel < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 -declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #0 +declare float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) ; GFX908: error: {{.*}} return versions of fp atomics not supported -define amdgpu_ps float @buffer_atomic_add_f32_rtn(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_rtn: +; GFX90A: buffer_atomic_add_f32 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{[0-9]+}} idxen offen glc +define amdgpu_kernel void @buffer_atomic_add_f32_rtn(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { main_body: %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) - ret float 
%ret + store float %ret, float* undef + ret void +} + +; GFX90A-LABEL: {{^}}buffer_atomic_add_v2f16_rtn: +; GFX90A: buffer_atomic_pk_add_f16 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}], s{{[0-9]+}} idxen offen glc +define amdgpu_kernel void @buffer_atomic_add_v2f16_rtn(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) { +main_body: + %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + store <2 x half> %ret, <2 x half>* undef + ret void } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll @@ -1,255 +1,453 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX908 +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX90A ; Natural mapping define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: 
[[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX908: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: 
struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: 
[[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX908: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset_plus4095__sgpr_soffset + ; GFX90A: 
bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset.add, i32 %soffset, i32 0) ret void } define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: 
[[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__4095_voffset__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 4095, align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 4095, i32 %soffset, i32 0) ret void } ; Natural mapping, no voffset define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY 
$vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } ; All register operands need legalization define amdgpu_ps void 
@struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] - ; CHECK: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec - ; CHECK: 
[[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec - ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; CHECK: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: 
struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: successors: %bb.2(0x80000000) + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX908: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX908: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX908: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX908: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX908: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX908: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX908: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GFX908: bb.2: + ; GFX908: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX908: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; GFX908: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX908: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec + 
; GFX908: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; GFX908: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec + ; GFX908: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GFX908: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX908: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX908: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec + ; GFX908: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; GFX908: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; GFX908: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX908: bb.3: + ; GFX908: successors: %bb.4(0x80000000) + ; GFX908: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GFX908: bb.4: + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: successors: %bb.2(0x80000000) + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $vgpr0, 
$vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX90A: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX90A: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX90A: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX90A: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GFX90A: bb.2: + ; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX90A: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub0, implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]].sub1, implicit $exec + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY11]], implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub0, implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]].sub1, implicit $exec + ; GFX90A: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; GFX90A: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = 
V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY12]], implicit $exec + ; GFX90A: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GFX90A: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX90A: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX90A: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec + ; GFX90A: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; GFX90A: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX90A: bb.3: + ; GFX90A: successors: %bb.4(0x80000000) + ; GFX90A: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GFX90A: bb.4: + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; All register operands need legalization, no voffset define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset(float inreg %val, <4 x i32> %rsrc, i32 inreg %vindex, i32 %soffset) { - ; 
CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 - ; CHECK: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec - ; CHECK: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec - ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, 
[[V_READFIRSTLANE_B32_3]], %subreg.sub1 - ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec - ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc - ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec - ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: successors: %bb.2(0x80000000) + ; GFX908: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX908: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX908: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX908: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX908: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX908: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX908: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GFX908: bb.2: + ; GFX908: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX908: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; GFX908: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX908: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; GFX908: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; GFX908: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; GFX908: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; GFX908: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GFX908: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], 
%subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX908: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX908: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; GFX908: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; GFX908: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX908: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX908: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX908: bb.3: + ; GFX908: successors: %bb.4(0x80000000) + ; GFX908: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GFX908: bb.4: + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__sgpr_val__vgpr_rsrc__0_voffset__vgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: successors: %bb.2(0x80000000) + ; GFX90A: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX90A: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GFX90A: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX90A: 
[[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 + ; GFX90A: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub2_sub3 + ; GFX90A: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; GFX90A: bb.2: + ; GFX90A: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; GFX90A: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub0, implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]].sub1, implicit $exec + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GFX90A: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE1]], [[COPY9]], implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub0, implicit $exec + ; GFX90A: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]].sub1, implicit $exec + ; GFX90A: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1 + ; GFX90A: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY10]], implicit $exec + ; GFX90A: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc + ; GFX90A: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX90A: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX90A: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec + ; GFX90A: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc + ; GFX90A: 
[[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY7]], [[COPY8]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX90A: S_CBRANCH_EXECNZ %bb.2, implicit $exec + ; GFX90A: bb.3: + ; GFX90A: successors: %bb.4(0x80000000) + ; GFX90A: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; GFX90A: bb.4: + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } ; Natural mapping + slc define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: 
BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX908: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 
= COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } define amdgpu_ps void @struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc + ; GFX908: bb.1 
(%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: struct_buffer_atomic_add_f32_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset_slc + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 2) 
ret void } define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX908: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX908: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX90A: [[BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(<2 x half> %val, <4 x 
i32> inreg %rsrc, i32 %vindex, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 - ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 - ; CHECK: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; CHECK: S_ENDPGM 0 + ; GFX908-LABEL: name: struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; GFX908: bb.1 (%ir-block.0): + ; GFX908: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX908: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX908: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX908: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX908: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX908: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX908: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX908: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX908: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX908: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX908: S_ENDPGM 0 + ; GFX90A-LABEL: name: 
struct_buffer_atomic_add_v2f16_noret__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset + ; GFX90A: bb.1 (%ir-block.0): + ; GFX90A: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1 + ; GFX90A: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX90A: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX90A: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX90A: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX90A: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX90A: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX90A: [[BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_PK_ADD_F16_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GFX90A: S_ENDPGM 0 %ret = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll @@ -15,7 +15,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 
0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset @@ -30,7 +30,7 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.struct.buffer.load.format.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -50,7 +50,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, 
[[COPY5]], %subreg.sub1 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1 ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 @@ -76,7 +76,7 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -102,7 +102,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; 
UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2 @@ -138,7 +138,7 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; PACKED: 
[[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 ; PACKED: $vgpr0 = COPY [[COPY7]] @@ -183,7 +183,7 @@ ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -247,7 +247,7 @@ ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], 
[[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -277,7 +277,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_buffer_load_format_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffsset_add_4095 @@ -292,7 +292,7 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], 
%subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 @@ -313,7 +313,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; PACKED-LABEL: name: struct_buffer_load_format_i16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset @@ -328,7 
+328,7 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call i16 @llvm.amdgcn.struct.buffer.load.format.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -15,7 +15,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: 
[[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -35,7 +35,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 ; CHECK: $vgpr0 = COPY [[COPY7]] @@ -58,7 +58,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec 
:: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1 ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub2 @@ -83,7 +83,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2 @@ -132,7 +132,7 @@ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -166,7 +166,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 @@ -187,7 +187,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll @@ -16,7 +16,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable 
load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -37,7 +37,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub1 ; CHECK: $vgpr0 = COPY [[COPY7]] @@ -61,7 +61,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 
[[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub1 ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub2 @@ -87,7 +87,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub1 ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub2 @@ -116,7 +116,7 @@ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN 
[[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 0) @@ -137,7 +137,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 @@ -158,7 +158,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 64 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 64, i32 0) @@ -200,7 +200,7 @@ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -227,7 +227,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN 
[[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4) + ; CHECK: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -249,7 +249,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4) + ; CHECK: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 1 from custom "BufferResource", addrspace 4) ; CHECK: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_BOTHEN]], 0, 8, implicit $exec ; CHECK: $vgpr0 = COPY [[V_BFE_I32_e64_]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -272,7 +272,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -294,7 +294,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_BOTHEN]], 0, 16, implicit $exec ; CHECK: $vgpr0 = COPY [[V_BFE_I32_e64_]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 @@ -318,7 +318,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], 
%subreg.sub1 - ; CHECK: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -339,7 +339,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -366,7 +366,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub1 ; CHECK: $vgpr0 = COPY [[COPY7]] @@ -390,7 +390,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll @@ -16,7 +16,7 @@ ; UNPACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: struct_buffer_store_format_f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -31,7 +31,7 @@ ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.f16(half 
%val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -55,7 +55,7 @@ ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: struct_buffer_store_format_v2f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -70,7 +70,7 @@ ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, 
i32 %voffset, i32 %soffset, i32 0) ret void @@ -103,7 +103,7 @@ ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY10]], [[COPY1]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_BOTHEN_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: struct_buffer_store_format_v4f16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -120,7 +120,7 @@ ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; 
PACKED: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -162,7 +162,7 @@ ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -206,7 +206,7 @@ ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact 
[[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -233,7 +233,7 @@ ; UNPACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_gfx80_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: struct_buffer_store_format_i16__vgpr_val__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; PACKED: bb.1 (%ir-block.0): @@ -248,7 +248,7 @@ ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, 
addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.i16(i16 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll @@ -15,7 +15,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -37,7 +37,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -60,7 +60,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XYZ_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZ_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -84,7 +84,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, 
[[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -126,7 +126,7 @@ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY9]], %subreg.sub0, [[COPY10]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY8]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -153,7 +153,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.format.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll @@ -17,7 +17,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> 
%rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -39,7 +39,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -62,7 +62,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX3_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void 
@llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -86,7 +86,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -132,7 +132,7 @@ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY13]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_BOTHEN_exact [[COPY11]], [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom 
"BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -159,7 +159,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_STORE_BYTE_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) + ; CHECK: BUFFER_STORE_BYTE_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "BufferResource", addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.struct.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -180,7 +180,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_STORE_SHORT_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "BufferResource", align 1, 
addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.struct.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -201,7 +201,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 1) ret void @@ -221,7 +221,7 @@ ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_BOTHEN_exact [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 
%voffset, i32 %soffset, i32 0) ret void @@ -249,7 +249,7 @@ ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_BOTHEN_exact [[REG_SEQUENCE]], [[REG_SEQUENCE2]], [[REG_SEQUENCE1]], [[COPY8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "BufferResource", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll @@ -16,7 +16,7 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: 
[[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset @@ -31,7 +31,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -51,8 +51,8 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: 
[[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN]] + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset ; UNPACKED: bb.1 (%ir-block.0): @@ -66,7 +66,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub0 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XY_gfx80_BOTHEN]].sub1 ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 @@ -103,7 +103,7 @@ 
; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub0 ; PACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN]].sub1 ; PACKED: $vgpr0 = COPY [[COPY7]] @@ -121,7 +121,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: 
[[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub0 ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub1 ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN]].sub2 @@ -163,7 +163,7 @@ ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; PACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_vindex0 @@ -179,7 +179,7 @@ ; UNPACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 2 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call half @llvm.amdgcn.struct.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -220,7 +220,7 @@ ; PACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; PACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_D16_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -266,7 +266,7 @@ ; UNPACKED: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; UNPACKED: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; UNPACKED: 
[[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -314,7 +314,7 @@ ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; PACKED: [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) ; PACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_BOTHEN]] ; PACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 ; UNPACKED-LABEL: name: struct_tbuffer_load_f16__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset_voffset_add4095 @@ 
-329,7 +329,7 @@ ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; UNPACKED: [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 2 from custom "BufferResource" + 4095, align 1, addrspace 4) ; UNPACKED: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_D16_X_gfx80_BOTHEN]] ; UNPACKED: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ -15,7 +15,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: 
[[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -35,7 +35,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 ; CHECK: $vgpr0 = COPY [[COPY7]] @@ -58,7 +58,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 
0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1 ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub2 @@ -83,7 +83,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1 ; CHECK: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2 @@ -111,7 +111,7 @@ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; CHECK: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1 - ; CHECK: 
[[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY5]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.tbuffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %voffset, i32 %soffset, i32 78, i32 0) @@ -152,7 +152,7 @@ ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY8]], %subreg.sub1 - ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -186,7 +186,7 @@ ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = 
COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) + ; CHECK: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 4095, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4095, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[TBUFFER_LOAD_FORMAT_X_BOTHEN]] ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %voffset = add i32 %voffset.base, 4095 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir @@ -0,0 +1,206 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY + +--- +name: mfma_f32_32x32x4bf16_1k_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + + ; FAST-LABEL: name: mfma_f32_32x32x4bf16_1k_vva + ; FAST: 
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FAST: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<32 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + ; GREEDY-LABEL: name: mfma_f32_32x32x4bf16_1k_vva + ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<32 x s32>), 0, 0, 0 + ; GREEDY: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16.1k), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... + +--- +name: mfma_f32_16x16x4bf16_1k_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; FAST-LABEL: name: mfma_f32_16x16x4bf16_1k_vva + ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FAST: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + ; GREEDY-LABEL: name: mfma_f32_16x16x4bf16_1k_vva + ; GREEDY: liveins: $vgpr0_vgpr1, 
$vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4bf16.1k), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: mfma_f32_4x4x4bf16_1k_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + + ; FAST-LABEL: name: mfma_f32_4x4x4bf16_1k_vva + ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + ; GREEDY-LABEL: name: mfma_f32_4x4x4bf16_1k_vva + ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4bf16.1k), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... 
+ +--- +name: mfma_f32_32x32x8bf16_1k_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + + ; FAST-LABEL: name: mfma_f32_32x32x8bf16_1k_vva + ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FAST: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<32 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + ; GREEDY-LABEL: name: mfma_f32_32x32x8bf16_1k_vva + ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<32 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8bf16.1k), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... 
+ +--- +name: mfma_f32_16x16x16bf16_1k_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + + ; FAST-LABEL: name: mfma_f32_16x16x16bf16_1k_vva + ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + ; GREEDY-LABEL: name: mfma_f32_16x16x16bf16_1k_vva + ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16bf16.1k), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16bf16.1k), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... 
+ +--- +name: mfma_f64_16x16x4f64_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + + ; FAST-LABEL: name: mfma_f64_16x16x4f64_vva + ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FAST: [[INT:%[0-9]+]]:agpr(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.16x16x4f64), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<8 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INT]](<8 x s32>) + ; GREEDY-LABEL: name: mfma_f64_16x16x4f64_vva + ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.16x16x4f64), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<8 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INT]](<8 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<8 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + %3:_(<8 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.16x16x4f64), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %3 +... 
+ +--- +name: mfma_f64_4x4x4f64_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1 + + ; FAST-LABEL: name: mfma_f64_4x4x4f64_vva + ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<2 x s32>) = COPY $agpr0_agpr1 + ; FAST: [[INT:%[0-9]+]]:agpr(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.4x4x4f64), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<2 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1 = COPY [[INT]](<2 x s32>) + ; GREEDY-LABEL: name: mfma_f64_4x4x4f64_vva + ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<2 x s32>) = COPY $agpr0_agpr1 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.4x4x4f64), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<2 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1 = COPY [[INT]](<2 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<2 x s32>) = COPY $agpr0_agpr1 + %3:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f64.4x4x4f64), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1 = COPY %3 +... 
Index: llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir +++ llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir @@ -39,7 +39,7 @@ bb.1: renamable $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, undef renamable $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + BUFFER_STORE_DWORD_OFFEN killed renamable $vgpr0, undef renamable $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) bb.2: S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/acc-ldst.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/acc-ldst.ll @@ -0,0 +1,316 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() + +; GCN-LABEL: {{^}}test_load_mfma_store16: +; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-NOT: v_accvgpr_write +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NOT: v_accvgpr_read +; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] +define amdgpu_kernel void @test_load_mfma_store16(<32 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) + 
store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}test_load1_mfma_store1: +; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-NOT: v_accvgpr_read +; GCN: v_mfma_f32_32x32x1f32 a{{\[}}[[N:[0-9]+]]: +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NOT: v_accvgpr_read +; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}] +define amdgpu_kernel void @test_load1_mfma_store1(float addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid + %in.1 = load float, float addrspace(1)* %gep + %init = insertelement <32 x float> zeroinitializer, float %in.1, i32 0 + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %init, i32 1, i32 2, i32 3) + %elt = extractelement <32 x float> %mai.1, i32 0 + store float %elt, float addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}test_load4_mfma_store4: +; GCN: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-NOT: v_accvgpr_write +; GCN: v_mfma_i32_4x4x4i8 [[A:a\[[0-9:]+\]]] +; GCN-NEXT: s_nop 4 +; GCN-NOT: v_accvgpr_read +; GCN-NEXT: global_store_dwordx4 v{{[0-9:]+}}, [[A]], s[{{[0-9:]+}}] +define amdgpu_kernel void @test_load4_mfma_store4(<4 x i32> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %gep + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}test_load_store: +; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-NOT: v_accvgpr +; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}] +define amdgpu_kernel void 
@test_load_store(<32 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32 + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1 + store <32 x float> %in.1, <32 x float> addrspace(1)* %gep.2 + ret void +} + +; GCN-LABEL: {{^}}test_load_add_mfma_store: +; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-COUNT-32: v_accvgpr_write +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NOT: v_accvgpr_read +; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}] +define amdgpu_kernel void @test_load_add_mfma_store(<32 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep + %add.1 = fadd <32 x float> %in.1, %in.1 + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}test_load_add_store: +; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-NOT: v_accvgpr +; GCN-COUNT-16: v_pk_add_f32 +; GCN-NOT: v_accvgpr +; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}] +define amdgpu_kernel void @test_load_add_store(<32 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep + %add.1 = fadd <32 x float> %in.1, %in.1 + store <32 x float> %add.1, <32 x float> addrspace(1)* %gep + ret void +} + +; GCN-LABEL: 
{{^}}test_load_mfma_add_store: +; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-COUNT-32: v_accvgpr_write +; GCN: v_mfma_f32_32x32x1f32 +; GCN-COUNT-32: v_accvgpr_read +; GCN: v_pk_add_f32 +; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}] +define amdgpu_kernel void @test_load_mfma_add_store(<32 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) + %add.1 = fadd <32 x float> %mai.1, %in.1 + store <32 x float> %add.1, <32 x float> addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}test_load_add_mfma_mul_store: +; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN: v_pk_add_f32 +; GCN-COUNT-32: v_accvgpr_write +; GCN: v_mfma_f32_32x32x1f32 +; GCN-COUNT-32: v_accvgpr_read +; GCN: v_pk_mul_f32 +; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}] +define amdgpu_kernel void @test_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep + %add.1 = fadd <32 x float> %in.1, %in.1 + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3) + %mul.1 = fmul <32 x float> %mai.1, %mai.1 + store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}test_mixeduse_load_add_mfma_mul_store: +; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-COUNT-32: v_accvgpr_write +; GCN: v_mfma_f32_32x32x1f32 +; GCN-COUNT-32: v_accvgpr_read +; GCN: 
v_pk_mul_f32 +; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}] +define amdgpu_kernel void @test_mixeduse_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep + %add.1 = fadd <32 x float> %in.1, %in.1 + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3) + %mul.1 = fmul <32 x float> %mai.1, %in.1 + store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}test_multiuse_load_mfma_mfma_store: +; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-NOT: v_accvgpr_write +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr_read +; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}] +define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(<32 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32 + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1 + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) + %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep.1 + store <32 x float> %mai.2, <32 x float> addrspace(1)* %gep.2 + ret void +} + +; NB: for atomics both vdata and vdst shall be either VGPR or AGPR +; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic_store: +; GCN: global_atomic_sub [[IN:v[0-9]+]], v{{[0-9:]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}] glc +; GCN-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, [[IN]] +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]: +; GCN: v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}} +; GCN: global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc +; GCN: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, +define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(i32 addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tid + %in.1 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst + %tmp0 = insertelement <4 x i32> undef, i32 %in.1, i32 0 + %tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1 + %tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2 + %tmp3 = insertelement <4 x i32> %tmp2, i32 0, i32 3 + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp3, i32 0, i32 0, i32 0) + %elt = extractelement <4 x i32> %mai.1, i32 0 + %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %elt seq_cst + store i32 %val, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store: +; GCN: global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc +; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]: +; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}} +; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}} +; GCN: global_atomic_add_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc +define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(i64 addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid + %in.1 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 1 seq_cst + %tmp0 = 
insertelement <2 x i64> undef, i64 %in.1, i32 0 + %tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1 + %tmp2 = bitcast <2 x i64> %tmp0 to <4 x i32> + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp2, i32 0, i32 0, i32 0) + %elt.1 = extractelement <4 x i32> %mai.1, i32 0 + %elt.2 = extractelement <4 x i32> %mai.1, i32 1 + %v2.1 = insertelement <2 x i32> undef, i32 %elt.1, i32 0 + %v2.2 = insertelement <2 x i32> %v2.1, i32 %elt.2, i32 1 + %v2 = bitcast <2 x i32> %v2.2 to i64 + %val = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %v2 seq_cst + store i64 %val, i64 addrspace(1)* %arg + ret void +} + +; NB: both data operands should be VGPR or AGPR +; GCN-LABEL: {{^}}test_load_mfma_ds2_store: +; GCN-DAG: ds_read_b128 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}} +; GCN-NOT: v_accvgpr_write +; GCN-DAG: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:{{[0-9]+}}], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]] +; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NOT: v_accvgpr_read +; GCN: ds_write_b32 v{{[0-9]+}}, a[[N]] offset:128 +define amdgpu_kernel void @test_load_mfma_ds2_store(<4 x i32> addrspace(3)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(3)* %arg, i32 %tid + %in.1 = load <4 x i32>, <4 x i32> addrspace(3)* %gep.1 + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0) + %elt = extractelement <4 x i32> %mai.1, i32 0 + %ptr = bitcast <4 x i32> addrspace(3)* %arg to i32 addrspace(3)* + %gep.2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 32 + store i32 1, i32 addrspace(3)* %ptr + store i32 %elt, i32 addrspace(3)* %gep.2 + ret void +} + +; GCN-LABEL: {{^}}test_mfma_loop_4xi32: +; GCN: global_load_dwordx4 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-NOT: v_accvgpr_write +; GCN: v_mfma_i32_4x4x4i8 [[RES:a\[[0-9:]+\]]], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]] +; GCN-NOT: v_accvgpr_read +; GCN: 
global_store_dwordx4 v[{{[0-9:]+}}], [[RES]], +define amdgpu_kernel void @test_mfma_loop_4xi32(<4 x i32> addrspace(1)* %arg) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid + %in = load <4 x i32>, <4 x i32> addrspace(1)* %gep + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <4 x i32> [ %in, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}test_mfma_loop_32xfloat: +; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}] +; GCN-NOT: v_accvgpr_write +; GCN: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr_read +; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}], +; GCN: s_endpgm +define amdgpu_kernel void @test_mfma_loop_32xfloat(<32 x float> addrspace(1)* %arg) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid + %in = load <32 x float>, <32 x float> addrspace(1)* %gep + br label %for.cond.preheader + +for.cond.preheader: + %phi = phi <32 x float> [ %in, %entry ], [ %mai.1, %for.cond.preheader ] + %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) + %inc = add nuw nsw i32 %c, 1 + %cc = icmp eq i32 %inc, 16 + br i1 %cc, label %exit, label %for.cond.preheader + +exit: + store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep + ret void +} Index: llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir 
=================================================================== --- llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX908 %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX90A %s --- | define amdgpu_kernel void @a_to_v() #0 { ret void } @@ -49,10 +50,14 @@ body: | bb.0: liveins: $agpr0 - ; GCN-LABEL: name: a_to_v - ; GCN: liveins: $agpr0 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $vgpr0 + ; GFX908-LABEL: name: a_to_v + ; GFX908: liveins: $agpr0 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $vgpr0 + ; GFX90A-LABEL: name: a_to_v + ; GFX90A: liveins: $agpr0 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $vgpr0 $vgpr0 = COPY killed $agpr0, implicit $exec S_ENDPGM 0, implicit $vgpr0 ... 
@@ -64,11 +69,16 @@ bb.0: liveins: $agpr0_agpr1 - ; GCN-LABEL: name: a2_to_v2 - ; GCN: liveins: $agpr0_agpr1 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec - ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1 + ; GFX908-LABEL: name: a2_to_v2 + ; GFX908: liveins: $agpr0_agpr1 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1 + ; GFX90A-LABEL: name: a2_to_v2 + ; GFX90A: liveins: $agpr0_agpr1 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1 $vgpr0_vgpr1 = COPY killed $agpr0_agpr1, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1 ... 
@@ -80,12 +90,18 @@ bb.0: liveins: $agpr0_agpr1_agpr2 - ; GCN-LABEL: name: a3_to_v3 - ; GCN: liveins: $agpr0_agpr1_agpr2 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec - ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 + ; GFX908-LABEL: name: a3_to_v3 + ; GFX908: liveins: $agpr0_agpr1_agpr2 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 + ; GFX90A-LABEL: name: a3_to_v3 + ; GFX90A: liveins: $agpr0_agpr1_agpr2 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 + ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 = COPY killed $agpr0_agpr1_agpr2, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 ... 
@@ -96,13 +112,20 @@ body: | bb.0: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GCN-LABEL: name: a4_to_v4 - ; GCN: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec - ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908-LABEL: name: a4_to_v4 + ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX90A-LABEL: name: a4_to_v4 + ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ... 
@@ -114,17 +137,28 @@ bb.0: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN-LABEL: name: a8_to_v8 - ; GCN: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec - ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908-LABEL: name: a8_to_v8 + ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A-LABEL: name: a8_to_v8 + ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY killed 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ... @@ -135,25 +169,44 @@ body: | bb.0: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN-LABEL: name: a16_to_v16 - ; GCN: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr8 = 
V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec - ; GCN: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908-LABEL: name: a16_to_v16 + ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A-LABEL: name: a16_to_v16 + ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: 
$vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr15 = 
V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ... @@ -164,10 +217,14 @@ body: | bb.0: liveins: $vgpr0 - ; GCN-LABEL: name: v_to_a - ; GCN: liveins: $vgpr0 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0 + ; GFX908-LABEL: name: v_to_a + ; GFX908: liveins: $vgpr0 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0 + ; GFX90A-LABEL: name: v_to_a + ; GFX90A: liveins: $vgpr0 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0 $agpr0 = COPY killed $vgpr0, implicit $exec S_ENDPGM 0, implicit $agpr0 ... 
@@ -178,11 +235,16 @@ body: | bb.0: liveins: $vgpr0_vgpr1 - ; GCN-LABEL: name: v2_to_a2 - ; GCN: liveins: $vgpr0_vgpr1 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1 + ; GFX908-LABEL: name: v2_to_a2 + ; GFX908: liveins: $vgpr0_vgpr1 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1 + ; GFX90A-LABEL: name: v2_to_a2 + ; GFX90A: liveins: $vgpr0_vgpr1 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $vgpr0_vgpr1, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1 ... 
@@ -193,12 +255,18 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2 - ; GCN-LABEL: name: v3_to_a3 - ; GCN: liveins: $vgpr0_vgpr1_vgpr2 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; GFX908-LABEL: name: v3_to_a3 + ; GFX908: liveins: $vgpr0_vgpr1_vgpr2 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; GFX90A-LABEL: name: v3_to_a3 + ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $vgpr0_vgpr1_vgpr2, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ... 
@@ -209,13 +277,20 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GCN-LABEL: name: v4_to_a4 - ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-LABEL: name: v4_to_a4 + ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-LABEL: name: v4_to_a4 + ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec S_ENDPGM 0, implicit 
$agpr0_agpr1_agpr2_agpr3 ... @@ -226,17 +301,28 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN-LABEL: name: v8_to_a8 - ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-LABEL: name: v8_to_a8 + ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-LABEL: name: v8_to_a8 + ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ... @@ -247,25 +333,44 @@ body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN-LABEL: name: v16_to_a16 - ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr8 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-LABEL: name: v16_to_a16 + ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-LABEL: name: v16_to_a16 + ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; 
GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + 
; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ... @@ -276,11 +381,16 @@ body: | bb.0: liveins: $sgpr0 - ; GCN-LABEL: name: s_to_a - ; GCN: liveins: $sgpr0 - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0 + ; GFX908-LABEL: name: s_to_a + ; GFX908: liveins: $sgpr0 + ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0 + ; GFX90A-LABEL: name: s_to_a + ; GFX90A: liveins: $sgpr0 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0 $agpr0 = COPY killed $sgpr0, implicit $exec S_ENDPGM 0, implicit $agpr0 ... 
@@ -291,13 +401,20 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; GCN-LABEL: name: s2_to_a2 - ; GCN: liveins: $sgpr0_sgpr1 - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1 + ; GFX908-LABEL: name: s2_to_a2 + ; GFX908: liveins: $sgpr0_sgpr1 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1 + ; GFX90A-LABEL: name: s2_to_a2 + ; GFX90A: liveins: $sgpr0_sgpr1 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1 ... 
@@ -308,15 +425,24 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2 - ; GCN-LABEL: name: s3_to_a3 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2 - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; GFX908-LABEL: name: s3_to_a3 + ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; GFX90A-LABEL: name: s3_to_a3 + ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, 
implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ... @@ -327,17 +453,28 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-LABEL: name: s4_to_a4 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-LABEL: name: s4_to_a4 + ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-LABEL: name: 
s4_to_a4 + ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ... @@ -348,21 +485,36 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GCN-LABEL: name: s6_to_a6 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 
- ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-LABEL: name: s6_to_a6 + ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-LABEL: name: s6_to_a6 + ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A: 
$vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ... 
@@ -373,25 +525,44 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN-LABEL: name: s8_to_a8 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-LABEL: name: s8_to_a8 + ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, 
implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-LABEL: name: s8_to_a8 + ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ... 
@@ -402,41 +573,76 @@ body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-LABEL: name: s16_to_a16 - ; GCN: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-LABEL: name: s16_to_a16 + ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-LABEL: name: s16_to_a16 + ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit 
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ... 
@@ -446,11 +652,15 @@ tracksRegLiveness: true body: | bb.0: - ; GCN-LABEL: name: a_to_a - ; GCN: $agpr1 = IMPLICIT_DEF - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0 + ; GFX908-LABEL: name: a_to_a + ; GFX908: $agpr1 = IMPLICIT_DEF + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0 + ; GFX90A-LABEL: name: a_to_a + ; GFX90A: $agpr1 = IMPLICIT_DEF + ; GFX90A: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0 $agpr1 = IMPLICIT_DEF $agpr0 = COPY killed $agpr1, implicit $exec S_ENDPGM 0, implicit $agpr0 @@ -462,14 +672,22 @@ body: | bb.0: liveins: $agpr0_agpr1 - ; GCN-LABEL: name: a2_to_a2_kill - ; GCN: liveins: $agpr0_agpr1 - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr1_agpr2 - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 + ; GFX908-LABEL: name: a2_to_a2_kill + ; GFX908: liveins: $agpr0_agpr1 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr1_agpr2 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec + ; 
GFX908: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 + ; GFX90A-LABEL: name: a2_to_a2_kill + ; GFX90A: liveins: $agpr0_agpr1 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr1_agpr2 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 $agpr1_agpr2 = COPY killed $agpr0_agpr1, implicit $exec $agpr3 = COPY $agpr2 S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 @@ -481,15 +699,24 @@ body: | bb.0: liveins: $agpr4_agpr5_agpr6 - ; GCN-LABEL: name: a3_to_a3_nonoverlap_kill - ; GCN: liveins: $agpr4_agpr5_agpr6 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; GFX908-LABEL: name: a3_to_a3_nonoverlap_kill + ; GFX908: liveins: $agpr4_agpr5_agpr6 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; 
GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; GFX90A-LABEL: name: a3_to_a3_nonoverlap_kill + ; GFX90A: liveins: $agpr4_agpr5_agpr6 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $agpr4_agpr5_agpr6 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ... 
@@ -500,15 +727,24 @@ body: | bb.0: liveins: $agpr1_agpr2_agpr3 - ; GCN-LABEL: name: a3_to_a3_overlap_kill - ; GCN: liveins: $agpr1_agpr2_agpr3 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GCN: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GCN: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 + ; GFX908-LABEL: name: a3_to_a3_overlap_kill + ; GFX908: liveins: $agpr1_agpr2_agpr3 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 + ; GFX90A-LABEL: name: a3_to_a3_overlap_kill + ; GFX90A: liveins: $agpr1_agpr2_agpr3 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX90A: $vgpr1 = 
V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 $agpr0_agpr1_agpr2 = COPY killed $agpr1_agpr2_agpr3 $vgpr1 = COPY $agpr1 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 @@ -519,16 +755,26 @@ tracksRegLiveness: true body: | bb.0: - ; GCN-LABEL: name: a4_to_a4 - ; GCN: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX908-LABEL: name: a4_to_a4 + ; GFX908: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 + ; GFX90A-LABEL: name: a4_to_a4 + ; GFX90A: $agpr0_agpr1_agpr2_agpr3 = 
IMPLICIT_DEF + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF $agpr2_agpr3_agpr4_agpr5 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 @@ -540,16 +786,26 @@ body: | bb.0: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GCN-LABEL: name: a4_to_a4_overlap - ; GCN: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 + ; GFX908-LABEL: name: a4_to_a4_overlap + ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, 
implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 + ; GFX90A-LABEL: name: a4_to_a4_overlap + ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 $agpr2_agpr3_agpr4_agpr5 = COPY $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 ... 
@@ -559,25 +815,44 @@ tracksRegLiveness: true body: | bb.0: - ; GCN-LABEL: name: a8_to_a8 - ; GCN: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GCN: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-LABEL: name: a8_to_a8 + ; GFX908: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-LABEL: name: a8_to_a8 + ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF + ; GFX90A: $vgpr0 = 
V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec S_ENDPGM 0, implicit 
$agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -589,41 +864,76 @@ body: | bb.0: - ; GCN-LABEL: name: a16_to_a16 - ; GCN: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GCN: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-LABEL: name: a16_to_a16 + ; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = 
V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-LABEL: name: a16_to_a16 + ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: 
$vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr22 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = 
COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 @@ -638,12 +948,17 @@ bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 - ; GCN-LABEL: name: a_to_a_spill - ; GCN: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 - ; GCN: $agpr1 = IMPLICIT_DEF - ; GCN: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr0 + ; GFX908-LABEL: name: a_to_a_spill + ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 + ; GFX908: $agpr1 = IMPLICIT_DEF + ; GFX908: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr0 + ; GFX90A-LABEL: name: a_to_a_spill + ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 + ; GFX90A: $agpr1 = IMPLICIT_DEF + ; GFX90A: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr0 $agpr1 = IMPLICIT_DEF $agpr0 = COPY killed $agpr1, implicit $exec S_ENDPGM 0, implicit $agpr0 @@ -656,18 +971,30 @@ bb.0: liveins: $agpr0, $sgpr2_sgpr3 - ; GCN-LABEL: name: copy_sgpr_to_agpr_tuple - ; GCN: liveins: $agpr0, $sgpr2_sgpr3 - ; GCN: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = 
V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908-LABEL: name: copy_sgpr_to_agpr_tuple + ; GFX908: liveins: $agpr0, $sgpr2_sgpr3 + ; GFX908: S_NOP 0, implicit-def dead $sgpr0_sgpr1 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple + ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 + ; GFX90A: S_NOP 0, implicit-def dead $sgpr0_sgpr1 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: 
$agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 S_NOP 0, implicit-def dead $sgpr0_sgpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 @@ -679,18 +1006,30 @@ bb.0: liveins: $agpr0, $sgpr2_sgpr3 - ; GCN-LABEL: name: copy_sgpr_to_agpr_tuple_kill - ; GCN: liveins: $agpr0, $sgpr2_sgpr3 - ; GCN: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GCN: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; GFX908-LABEL: name: copy_sgpr_to_agpr_tuple_kill + ; GFX908: liveins: $agpr0, $sgpr2_sgpr3 + ; GFX908: S_NOP 0, implicit-def dead $sgpr0_sgpr1 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr6 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple_kill + ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 + ; GFX90A: S_NOP 0, implicit-def dead $sgpr0_sgpr1 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $sgpr0_sgpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 @@ -703,18 +1042,30 @@ bb.0: liveins: $agpr0, $agpr2_agpr3 - ; GCN-LABEL: name: copy_agpr_to_agpr_tuple - ; GCN: liveins: $agpr0, $agpr2_agpr3 - ; GCN: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def 
$agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-LABEL: name: copy_agpr_to_agpr_tuple + ; GFX908: liveins: $agpr0, $agpr2_agpr3 + ; GFX908: S_NOP 0, implicit-def dead $agpr0_agpr1 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple + ; GFX90A: liveins: $agpr0, $agpr2_agpr3 + ; GFX90A: S_NOP 0, implicit-def dead $agpr0_agpr1 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 
+ ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 S_NOP 0, implicit-def dead $agpr0_agpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 @@ -727,18 +1078,30 @@ bb.0: liveins: $agpr0, $agpr2_agpr3 - ; GCN-LABEL: name: copy_agpr_to_agpr_tuple_kill - ; GCN: liveins: $agpr0, $agpr2_agpr3 - ; GCN: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GCN: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GCN: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 - ; GCN: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec - ; GCN: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; GFX908-LABEL: name: copy_agpr_to_agpr_tuple_kill + ; GFX908: liveins: $agpr0, $agpr2_agpr3 + ; GFX908: S_NOP 0, implicit-def dead 
$agpr0_agpr1 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple_kill + ; GFX90A: liveins: $agpr0, $agpr2_agpr3 + ; GFX90A: S_NOP 0, implicit-def dead $agpr0_agpr1 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $agpr0_agpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit 
$agpr4_agpr5_agpr6_agpr7 Index: llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/adjust-writemask-vectorized.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Check that write mask is 0xf. + +; GCN-LABEL: {{^}}sample_2d_vectorized_use: +; GCN: image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf +define amdgpu_ps <4 x float> @sample_2d_vectorized_use(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, <4 x float> %a) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %r = fadd <4 x float> %v, %a + ret <4 x float> %r +} + +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) Index: llvm/test/CodeGen/AMDGPU/agpr-csr.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/agpr-csr.ll @@ -0,0 +1,206 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s + +; GCN-LABEL: {{^}}func_empty: +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GCN: s_setpc_b64 +define void @func_empty() #0 { + ret void +} + +; GCN-LABEL: {{^}}func_areg_4: +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GCN: use agpr3 +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GCN: s_setpc_b64 +define void @func_areg_4() #0 { + call void asm sideeffect "; use agpr3", "~{a3}" () + ret void +} + +; GCN-LABEL: {{^}}func_areg_32: +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GCN: use agpr31 +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GCN: s_setpc_b64 +define void @func_areg_32() #0 { + call void asm sideeffect "; use agpr31", "~{a31}" () + ret void +} + +; GCN-LABEL: {{^}}func_areg_33: +; GFX908-NOT: 
buffer_ +; GCN-NOT: v_accvgpr +; GCN: use agpr32 +; GFX908-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GCN: s_setpc_b64 +define void @func_areg_33() #0 { + call void asm sideeffect "; use agpr32", "~{a32}" () + ret void +} + +; GCN-LABEL: {{^}}func_areg_64: +; GFX908-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GFX90A: buffer_store_dword a63, +; GCN: use agpr63 +; GFX90A: buffer_load_dword a63, +; GCN-NOT: v_accvgpr +; GCN: s_setpc_b64 +define void @func_areg_64() #0 { + call void asm sideeffect "; use agpr63", "~{a63}" () + ret void +} + +; GCN-LABEL: {{^}}func_areg_31_63: +; GFX908-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GFX90A: buffer_store_dword a63, +; GCN: use agpr31, agpr63 +; GFX90A: buffer_load_dword a63, +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GCN: s_setpc_b64 +define void @func_areg_31_63() #0 { + call void asm sideeffect "; use agpr31, agpr63", "~{a31},~{a63}" () + ret void +} + +declare void @func_unknown() #0 + +; GCN-LABEL: {{^}}test_call_empty: +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GCN: def a[0:31] +; GFX908-COUNT-8: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr +; GCN-NOT: buffer_ +; GCN: s_swappc_b64 +; GCN-NOT: buffer_ +; GFX90A-NOT: v_accvgpr +; GFX908-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}] +; GCN: s_endpgm +define amdgpu_kernel void @test_call_empty() #0 { +bb: + %reg = call <32 x float> asm sideeffect "; def $0", "=a"() + call void @func_empty() + store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_areg4: +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GFX908: def a[0:31] +; GFX90A: def a[4:35] +; GFX908-COUNT-8: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr +; GCN-NOT: buffer_ +; GCN: s_swappc_b64 +; GCN-NOT: buffer_ +; GFX90A-NOT: v_accvgpr +; GFX908-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}] +; GCN: s_endpgm 
+define amdgpu_kernel void @test_call_areg4() #0 { +bb: + %reg = call <32 x float> asm sideeffect "; def $0", "=a"() + call void @func_areg_4() + store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_areg32: +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GFX908: def a[0:31] +; GFX90A: def a[32:63] +; GFX908-COUNT-8: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr +; GCN-NOT: buffer_ +; GCN: s_swappc_b64 +; GCN-NOT: buffer_ +; GFX90A-NOT: v_accvgpr +; GFX908-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}] +; GCN: s_endpgm +define amdgpu_kernel void @test_call_areg32() #0 { +bb: + %reg = call <32 x float> asm sideeffect "; def $0", "=a"() + call void @func_areg_32() + store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_areg64: +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GCN: def a[0:31] +; GFX908-COUNT-8: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr +; GCN-NOT: buffer_ +; GCN: s_swappc_b64 +; GCN-NOT: buffer_ +; GFX90A-NOT: v_accvgpr +; GFX908-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}] +; GCN: s_endpgm +define amdgpu_kernel void @test_call_areg64() #0 { +bb: + %reg = call <32 x float> asm sideeffect "; def $0", "=a"() + call void @func_areg_64() + store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_areg31_63: +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GFX908: def a[0:31] +; GFX90A: def a[32:63] +; GFX908-COUNT-8: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr +; GCN-NOT: buffer_ +; GCN: s_swappc_b64 +; GCN-NOT: buffer_ +; GFX90A-NOT: v_accvgpr +; GFX908-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}] +; GCN: s_endpgm +define amdgpu_kernel void 
@test_call_areg31_63() #0 { +bb: + %reg = call <32 x float> asm sideeffect "; def $0", "=a"() + call void @func_areg_31_63() + store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_unknown: +; GCN-NOT: buffer_ +; GCN-NOT: v_accvgpr +; GFX908: def a[0:31] +; GFX90A: def a[32:63] +; GFX908-COUNT-8: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr +; GCN-NOT: buffer_ +; GCN: s_swappc_b64 +; GCN-NOT: buffer_ +; GFX90A-NOT: v_accvgpr +; GFX908-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}] +; GCN: s_endpgm +define amdgpu_kernel void @test_call_unknown() #0 { +bb: + %reg = call <32 x float> asm sideeffect "; def $0", "=a"() + call void @func_unknown() + store volatile <32 x float> %reg, <32 x float> addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } Index: llvm/test/CodeGen/AMDGPU/agpr-register-count.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -1,14 +1,22 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s ; GCN-LABEL: {{^}}kernel_32_agprs: -; GCN: .amdhsa_next_free_vgpr 32 +; GFX908: .amdhsa_next_free_vgpr 32 +; GFX90A: .amdhsa_next_free_vgpr 44 +; GFX90A: .amdhsa_accum_offset 12 ; GCN: NumVgprs: 9 ; GCN: NumAgprs: 32 -; GCN: TotalNumVgprs: 32 -; GCN: VGPRBlocks: 7 -; GCN: NumVGPRsForWavesPerEU: 32 +; GFX908: TotalNumVgprs: 32 +; GFX90A: TotalNumVgprs: 44 +; GFX908: VGPRBlocks: 7 +; GFX90A: VGPRBlocks: 5 +; GFX908: NumVGPRsForWavesPerEU: 32 +; 
GFX90A: NumVGPRsForWavesPerEU: 44 +; GFX90A: AccumOffset: 12 ; GCN: Occupancy: 8 -define amdgpu_kernel void @kernel_32_agprs() { +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2 +define amdgpu_kernel void @kernel_32_agprs() #0 { bb: call void asm sideeffect "", "~{v8}" () call void asm sideeffect "", "~{a31}" () @@ -17,27 +25,39 @@ ; GCN-LABEL: {{^}}kernel_0_agprs: ; GCN: .amdhsa_next_free_vgpr 1 +; GFX90A: .amdhsa_accum_offset 4 ; GCN: NumVgprs: 1 ; GCN: NumAgprs: 0 ; GCN: TotalNumVgprs: 1 ; GCN: VGPRBlocks: 0 ; GCN: NumVGPRsForWavesPerEU: 1 -; GCN: Occupancy: 10 -define amdgpu_kernel void @kernel_0_agprs() { +; GFX90A: AccumOffset: 4 +; GFX908: Occupancy: 10 +; GFX90A: Occupancy: 8 +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 0 +define amdgpu_kernel void @kernel_0_agprs() #0 { bb: call void asm sideeffect "", "~{v0}" () ret void } ; GCN-LABEL: {{^}}kernel_40_vgprs: -; GCN: .amdhsa_next_free_vgpr 40 +; GFX908: .amdhsa_next_free_vgpr 40 +; GFX90A: .amdhsa_next_free_vgpr 56 +; GFX90A: .amdhsa_accum_offset 40 ; GCN: NumVgprs: 40 ; GCN: NumAgprs: 16 -; GCN: TotalNumVgprs: 40 -; GCN: VGPRBlocks: 9 -; GCN: NumVGPRsForWavesPerEU: 40 -; GCN: Occupancy: 6 -define amdgpu_kernel void @kernel_40_vgprs() { +; GFX908: TotalNumVgprs: 40 +; GFX90A: TotalNumVgprs: 56 +; GFX908: VGPRBlocks: 9 +; GFX90A: VGPRBlocks: 6 +; GFX908: NumVGPRsForWavesPerEU: 40 +; GFX90A: NumVGPRsForWavesPerEU: 56 +; GFX90A: AccumOffset: 40 +; GFX908: Occupancy: 6 +; GFX90A: Occupancy: 8 +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 9 +define amdgpu_kernel void @kernel_40_vgprs() #0 { bb: call void asm sideeffect "", "~{v39}" () call void asm sideeffect "", "~{a15}" () @@ -47,7 +67,8 @@ ; GCN-LABEL: {{^}}func_32_agprs: ; GCN: NumVgprs: 9 ; GCN: NumAgprs: 32 -; GCN: TotalNumVgprs: 32 +; GFX908: TotalNumVgprs: 32 +; GFX90A: TotalNumVgprs: 44 define void @func_32_agprs() #0 { bb: call void asm sideeffect "", "~{v8}" () @@ -58,8 +79,9 @@ ; GCN-LABEL: {{^}}func_32_vgprs: ; GCN: NumVgprs: 32 ; GCN: 
NumAgprs: 9 -; GCN: TotalNumVgprs: 32 -define void @func_32_vgprs() { +; GFX908: TotalNumVgprs: 32 +; GFX90A: TotalNumVgprs: 41 +define void @func_32_vgprs() #0 { bb: call void asm sideeffect "", "~{v31}" () call void asm sideeffect "", "~{a8}" () @@ -70,21 +92,28 @@ ; GCN: NumVgprs: 1 ; GCN: NumAgprs: 0 ; GCN: TotalNumVgprs: 1 -define amdgpu_kernel void @func_0_agprs() { +define amdgpu_kernel void @func_0_agprs() #0 { bb: call void asm sideeffect "", "~{v0}" () ret void } ; GCN-LABEL: {{^}}kernel_max_gprs: -; GCN: .amdhsa_next_free_vgpr 256 +; GFX908: .amdhsa_next_free_vgpr 256 +; GFX90A: .amdhsa_next_free_vgpr 512 +; GFX90A: .amdhsa_accum_offset 256 ; GCN: NumVgprs: 256 ; GCN: NumAgprs: 256 -; GCN: TotalNumVgprs: 256 -; GCN: VGPRBlocks: 63 -; GCN: NumVGPRsForWavesPerEU: 256 +; GFX908: TotalNumVgprs: 256 +; GFX90A: TotalNumVgprs: 512 +; GFX908: VGPRBlocks: 63 +; GFX90A: VGPRBlocks: 63 +; GFX908: NumVGPRsForWavesPerEU: 256 +; GFX90A: NumVGPRsForWavesPerEU: 512 +; GFX90A: AccumOffset: 256 ; GCN: Occupancy: 1 -define amdgpu_kernel void @kernel_max_gprs() { +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63 +define amdgpu_kernel void @kernel_max_gprs() #0 { bb: call void asm sideeffect "", "~{v255}" () call void asm sideeffect "", "~{a255}" () @@ -92,14 +121,20 @@ } ; GCN-LABEL: {{^}}kernel_call_func_32_agprs: -; GCN: .amdhsa_next_free_vgpr 32 +; GFX908: .amdhsa_next_free_vgpr 32 +; GFX90A: .amdhsa_accum_offset 12 ; GCN: NumVgprs: 9 ; GCN: NumAgprs: 32 -; GCN: TotalNumVgprs: 32 -; GCN: VGPRBlocks: 7 -; GCN: NumVGPRsForWavesPerEU: 32 +; GFX908: TotalNumVgprs: 32 +; GFX90A: TotalNumVgprs: 44 +; GFX908: VGPRBlocks: 7 +; GFX90A: VGPRBlocks: 5 +; GFX908: NumVGPRsForWavesPerEU: 32 +; GFX90A: NumVGPRsForWavesPerEU: 44 +; GFX90A: AccumOffset: 12 ; GCN: Occupancy: 8 -define amdgpu_kernel void @kernel_call_func_32_agprs() { +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 2 +define amdgpu_kernel void @kernel_call_func_32_agprs() #0 { bb: call void @func_32_agprs() #0 ret 
void @@ -108,8 +143,9 @@ ; GCN-LABEL: {{^}}func_call_func_32_agprs: ; GCN: NumVgprs: 9 ; GCN: NumAgprs: 32 -; GCN: TotalNumVgprs: 32 -define void @func_call_func_32_agprs() { +; GFX908: TotalNumVgprs: 32 +; GFX90A: TotalNumVgprs: 44 +define void @func_call_func_32_agprs() #0 { bb: call void @func_32_agprs() #0 ret void @@ -118,17 +154,25 @@ declare void @undef_func() ; GCN-LABEL: {{^}}kernel_call_undef_func: -; GCN: .amdhsa_next_free_vgpr 24 +; GFX908: .amdhsa_next_free_vgpr 24 +; GFX90A: .amdhsa_next_free_vgpr 48 +; GFX90A: .amdhsa_accum_offset 24 ; GCN: NumVgprs: 24 ; GCN: NumAgprs: 24 -; GCN: TotalNumVgprs: 24 -; GCN: VGPRBlocks: 5 -; GCN: NumVGPRsForWavesPerEU: 24 -; GCN: Occupancy: 10 -define amdgpu_kernel void @kernel_call_undef_func() { +; GFX908: TotalNumVgprs: 24 +; GFX90A: TotalNumVgprs: 48 +; GFX908: VGPRBlocks: 5 +; GFX90A: VGPRBlocks: 5 +; GFX908: NumVGPRsForWavesPerEU: 24 +; GFX90A: NumVGPRsForWavesPerEU: 48 +; GFX90A: AccumOffset: 24 +; GFX908: Occupancy: 10 +; GFX90A: Occupancy: 8 +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 5 +define amdgpu_kernel void @kernel_call_undef_func() #0 { bb: call void @undef_func() ret void } -attributes #0 = { nounwind noinline } +attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } Index: llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll +++ llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll @@ -1,6 +1,7 @@ ; -enable-misched=false makes the register usage more predictable ; -regalloc=fast just makes the test run faster ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX9 +; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s 
--check-prefixes=GCN,GFX90A ; RUN: llc -march=amdgcn -mcpu=gfx1010 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE32 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10WGP-WAVE64 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+cumode -amdgpu-function-calls=false -enable-misched=false -regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX10CU-WAVE32 @@ -524,6 +525,9 @@ ; GCN-LABEL: {{^}}f256: ; GFX9: NumVgprs: 256 +; GFX90A: NumVgprs: 256 +; GFX90A: NumAgprs: 0 +; GFX90A: TotalNumVgprs: 256 ; GFX10WGP-WAVE32: NumVgprs: 256 ; GFX10WGP-WAVE64: NumVgprs: 256 ; GFX10CU-WAVE32: NumVgprs: 256 @@ -536,6 +540,9 @@ ; GCN-LABEL: {{^}}f512: ; GFX9: NumVgprs: 128 +; GFX90A: NumVgprs: 128 +; GFX90A: NumAgprs: 128 +; GFX90A: TotalNumVgprs: 256 ; GFX10WGP-WAVE32: NumVgprs: 256 ; GFX10WGP-WAVE64: NumVgprs: 256 ; GFX10CU-WAVE32: NumVgprs: 128 @@ -548,6 +555,9 @@ ; GCN-LABEL: {{^}}f1024: ; GFX9: NumVgprs: 64 +; GFX90A: NumVgprs: 64 +; GFX90A: NumAgprs: 64 +; GFX90A: TotalNumVgprs: 128 ; GFX10WGP-WAVE32: NumVgprs: 128 ; GFX10WGP-WAVE64: NumVgprs: 128 ; GFX10CU-WAVE32: NumVgprs: 64 Index: llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir +++ llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir @@ -90,7 +90,7 @@ DBG_VALUE renamable $sgpr6_sgpr7, $noreg, !11, !DIExpression(DW_OP_plus_uconst, 12, DW_OP_stack_value), debug-location !12 $vgpr1 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr6_sgpr7 $vgpr2 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr6_sgpr7, implicit $exec - GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, renamable $vgpr0, 12, 0, 0, 0, implicit $exec, debug-location !12 :: (store 4 into 
%ir.tmp2, addrspace 1) + GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, renamable $vgpr0, 12, 0, 0, 0, 0, implicit $exec, debug-location !12 :: (store 4 into %ir.tmp2, addrspace 1) renamable $sgpr4 = S_MOV_B32 8388608 renamable $sgpr4_sgpr5 = nofpexcept V_CMP_GT_F32_e64 0, killed $sgpr4, 0, killed $vgpr0, 0, implicit $mode, implicit $exec renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc @@ -102,7 +102,7 @@ renamable $sgpr4_sgpr5 = IMPLICIT_DEF $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr4_sgpr5 $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit killed $sgpr4_sgpr5, implicit $exec - renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1) + renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`, addrspace 1) renamable $sgpr4 = S_MOV_B32 2139095040 S_WAITCNT 3952 renamable $sgpr4_sgpr5 = nofpexcept V_CMP_NEQ_F32_e64 0, killed $sgpr4, 0, killed $vgpr0, 0, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir +++ llvm/test/CodeGen/AMDGPU/break-smem-soft-clauses.mir @@ -326,11 +326,11 @@ bb.0: ; GCN-LABEL: name: flat_inst_breaks_smem_clause ; GCN: $sgpr0 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr12_sgpr13, 0, 0, 0 ; GCN-NEXT: S_ENDPGM 0 $sgpr0 = S_LOAD_DWORD_IMM $sgpr10_sgpr11, 0, 0, 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = 
FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $sgpr2 = S_LOAD_DWORD_IMM $sgpr12_sgpr13, 0, 0, 0 S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ llvm/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -9,10 +9,10 @@ body: | bb.0: ; GCN-LABEL: name: trivial_clause_load_flat4_x1 - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -22,12 +22,12 @@ body: | bb.0: ; GCN-LABEL: name: trivial_clause_load_flat4_x2 - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr1 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -37,14 +37,14 @@ body: | bb.0: ; GCN-LABEL: name: trivial_clause_load_flat4_x3 - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr1 = FLAT_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2 = FLAT_LOAD_DWORD $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = FLAT_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -54,16 +54,16 @@ body: | bb.0: ; GCN-LABEL: name: trivial_clause_load_flat4_x4 - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr8_vgpr9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr1 = FLAT_LOAD_DWORD $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2 = FLAT_LOAD_DWORD $vgpr8_vgpr9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr3 = FLAT_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = FLAT_LOAD_DWORD $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = FLAT_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -73,12 +73,12 @@ body: | bb.0: ; GCN-LABEL: name: trivial_clause_load_flat4_x2_sameptr - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -88,10 +88,10 @@ body: | bb.0: ; GCN-LABEL: name: flat_load4_overwrite_ptr_lo - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -101,10 +101,10 @@ body: | bb.0: ; GCN-LABEL: name: flat_load4_overwrite_ptr_hi - ; GCN: $vgpr1 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr1 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -114,10 +114,10 @@ body: | bb.0: ; GCN-LABEL: name: flat_load8_overwrite_ptr - ; GCN: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -130,48 +130,48 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_at_max_clause_size_flat_load4 - ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 
implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit 
$flat_scr + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18 ; GCN-NEXT: S_ENDPGM 0 - $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr5 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr6 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr7 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr8 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr9 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - 
$vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr10 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr11 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr12 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr13 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr14 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr15 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr16 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr17 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $sgpr0 = S_MOV_B32 $sgpr0, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18 S_ENDPGM 0 ... 
@@ -182,13 +182,13 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_simple_load_flat4_lo_ptr - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -198,13 +198,13 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_simple_load_flat4_hi_ptr - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr3 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr3 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -214,13 +214,13 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_simple_load_flat8_ptr - ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -231,12 +231,12 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_simple_load_flat16_ptr - ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2_vgpr3_vgpr4_vgpr5 = FLAT_LOAD_DWORDX4 $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -251,17 +251,17 @@ ; GCN-LABEL: name: break_clause_block_boundary_load_flat8_ptr ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN: bb.1: ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 bb.0: - $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr bb.1: - $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2_vgpr3 = FLAT_LOAD_DWORDX2 $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -272,12 +272,12 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_store_load_into_ptr_flat4 - ; GCN: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -289,12 +289,12 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_store_load_into_data_flat4 - ; GCN: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -305,14 +305,14 @@ body: | bb.0: ; GCN-LABEL: name: valu_inst_breaks_clause - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $vgpr8 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $vgpr8 = V_MOV_B32_e32 0, implicit $exec - $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -323,14 +323,14 @@ body: | bb.0: ; GCN-LABEL: name: salu_inst_breaks_clause - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $sgpr8 = S_MOV_B32 0 - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $sgpr8 = S_MOV_B32 0 - $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -340,14 +340,14 @@ body: | bb.0: ; GCN-LABEL: name: ds_inst_breaks_clause - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $vgpr8 = DS_READ_B32 $vgpr9, 0, 0, implicit $m0, implicit $exec - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $vgpr8 = DS_READ_B32 $vgpr9, 0, 0, implicit $m0, implicit $exec - $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -357,14 +357,14 @@ body: | bb.0: ; GCN-LABEL: name: smrd_inst_breaks_clause - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: $sgpr8 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $sgpr8 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 - $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -374,13 +374,13 @@ body: | bb.0: ; GCN-LABEL: name: implicit_use_breaks_clause - ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr4_vgpr5 + ; GCN: $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr4_vgpr5 ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr4_vgpr5 - $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 $vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr4_vgpr5 + $vgpr4_vgpr5 = FLAT_LOAD_DWORDX2 $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit 
$exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -389,12 +389,12 @@ body: | bb.0: ; GCN-LABEL: name: trivial_clause_load_mubuf4_x2 - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
--- @@ -403,13 +403,13 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_simple_load_mubuf_offen_ptr - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
--- @@ -420,11 +420,11 @@ body: | bb.0: ; GCN-LABEL: name: mubuf_load4_overwrite_ptr - ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec S_ENDPGM 0 @@ -437,13 +437,13 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_flat_load_mubuf_load - ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
# Break a clause from interference between mubuf and flat instructions @@ -458,8 +458,8 @@ body: | bb.0: - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... @@ -470,13 +470,13 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_flat4 - ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr5_vgpr6, $vgpr7, 0, 1, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr5_vgpr6, $vgpr7, 0, 1, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr5_vgpr6, $vgpr7, 0, 1, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_ATOMIC_ADD_RTN $vgpr5_vgpr6, $vgpr7, 0, 1, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -485,12 +485,12 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_flat4 - ; GCN: FLAT_ATOMIC_ADD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: FLAT_ATOMIC_ADD $vgpr0_vgpr1, $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr2 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; GCN-NEXT: S_ENDPGM 0 - FLAT_ATOMIC_ADD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_ATOMIC_ADD $vgpr0_vgpr1, $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -500,12 +500,12 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_mubuf4 - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; XNACK-NEXT: S_NOP 0 ; GCN-NEXT: $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 1, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 1, 0, implicit $exec S_ENDPGM 0 ... 
@@ -517,11 +517,11 @@ bb.0: ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_mubuf4 ; GCN: BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -532,11 +532,11 @@ body: | bb.0: ; GCN-LABEL: name: no_break_clause_mubuf_load_novaddr - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
--- @@ -546,16 +546,16 @@ body: | bb.0: ; GCN-LABEL: name: mix_load_store_clause - ; GCN: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr2_vgpr3, $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
--- @@ -565,15 +565,15 @@ body: | bb.0: ; GCN-LABEL: name: mix_load_store_clause_same_address - ; GCN: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr10 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr11 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -10,26 +10,26 @@ ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; GCN: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0, 0 :: (dereferenceable invariant load 16 from %ir.arg0, addrspace 6) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 16, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 16, align 1, addrspace 4) ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = 
BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 32, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 32, align 1, addrspace 4) + ; GCN: 
BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 48, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec 
:: (dereferenceable load 16 from custom "BufferResource" + 48, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 64, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed 
[[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 64, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 80, align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) @@ -39,16 +39,20 @@ ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 96, align 1, addrspace 4) - ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 + ; GCN: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 ; GCN: 
[[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF1]].sub0 + ; GCN: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[DEF1]] + ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]].sub0 ; GCN: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]].sub0 + ; GCN: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[DEF2]] + ; GCN: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]].sub0 ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 + ; GCN: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[DEF3]] + ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 112, align 1, addrspace 4) @@ -56,23 +60,23 @@ ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], 
[[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY10]], 128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom 
"BufferResource" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY11]], 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 160, align 1, addrspace 4) ; GCN: 
[[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 @@ -80,148 +84,160 @@ ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 160, align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 176, align 1, addrspace 4) - ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 + ; GCN: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[DEF4]] + ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY13]].sub0 ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 ; GCN: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 176, align 1, addrspace 4) - ; GCN: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub0 + ; GCN: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[DEF5]] + ; GCN: [[COPY16:%[0-9]+]]:vgpr_32 = COPY 
[[COPY15]].sub0 ; GCN: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 ; GCN: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 176, align 1, addrspace 4) - ; GCN: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF6]].sub0 + ; GCN: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[DEF6]] + ; GCN: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 ; GCN: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[DEF7]].sub0 - ; GCN: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[DEF7]] + ; GCN: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 + ; GCN: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 176, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY22:%[0-9]+]]:vreg_64 = COPY [[DEF8]] + ; GCN: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY22]].sub0 ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4) + ; GCN: 
BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], 
[[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY24:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY24]], 192, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY25:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY25]], 208, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4) + ; GCN: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = 
S_MOV_B32 112 - ; GCN: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4) + ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY27]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 - ; GCN: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4) + ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 224, align 1, addrspace 4) ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY30:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[COPY30]], 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 
0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4) + ; GCN: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 - ; GCN: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4) + ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY32]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 - ; GCN: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = 
BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource" + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY35:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[S_LOAD_DWORDX4_IMM]], [[COPY35]], 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: 
[[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4) - ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4) + ; GCN: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4) + ; GCN: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY37]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4) ; GCN: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 - ; GCN: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, addrspace 4) + ; GCN: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 256, align 1, 
addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4) - ; GCN: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[DEF9]].sub0 + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY41]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, 
align 1, addrspace 4) + ; GCN: [[COPY42:%[0-9]+]]:vreg_64 = COPY [[DEF9]] + ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[COPY42]].sub0 ; GCN: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 - ; GCN: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF10:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4) - ; GCN: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[DEF10]].sub0 + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4) + ; GCN: [[COPY45:%[0-9]+]]:vreg_64 = COPY [[DEF10]] + ; GCN: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[COPY45]].sub0 ; GCN: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 - ; GCN: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF11:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4) - ; GCN: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[DEF11]].sub0 + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource" + 272, align 1, addrspace 4) + ; GCN: [[COPY48:%[0-9]+]]:vreg_64 = COPY [[DEF11]] + ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[COPY48]].sub0 ; GCN: [[DEF12:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[DEF12]].sub0 - ; GCN: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: [[COPY50:%[0-9]+]]:vreg_64 = COPY [[DEF12]] + ; GCN: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[COPY50]].sub0 + ; GCN: [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF13:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[DEF13]].sub0 + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY54:%[0-9]+]]:vreg_64 = COPY [[DEF13]] + ; GCN: [[COPY55:%[0-9]+]]:vgpr_32 = COPY [[COPY54]].sub0 ; GCN: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY42:%[0-9]+]]:vgpr_32 = COPY [[DEF14]].sub0 + ; GCN: [[COPY56:%[0-9]+]]:vreg_64 = COPY [[DEF14]] + ; GCN: [[COPY57:%[0-9]+]]:vgpr_32 = COPY [[COPY56]].sub0 ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, -1, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 + ; 
GCN: [[COPY58:%[0-9]+]]:vreg_64 = COPY [[DEF15]] + ; GCN: [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[COPY58]].sub0 ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4) - ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4) + ; GCN: [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4) + ; GCN: [[COPY61:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4) ; GCN: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 - ; GCN: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into 
custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY62:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY62]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY63:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY64:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY63]], [[S_LOAD_DWORDX4_IMM]], [[COPY64]], 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 
0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4) + ; GCN: [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY65]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 - ; GCN: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4) + ; GCN: [[COPY66:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY66]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 - ; GCN: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY67:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY67]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource" + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: [[COPY68:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN: [[COPY69:%[0-9]+]]:sreg_32 = 
COPY [[COPY]] + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[COPY69]], 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "BufferResource", align 1, addrspace 4) ; GCN: S_ENDPGM 0 bb.0: %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* %arg0, align 16, !invariant.load !0 Index: llvm/test/CodeGen/AMDGPU/bundle-latency.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/bundle-latency.mir +++ llvm/test/CodeGen/AMDGPU/bundle-latency.mir @@ -10,14 +10,14 @@ bb.0: ; GCN-LABEL: name: src_bundle_latency ; GCN: $vgpr0, $vgpr1 = BUNDLE undef $vgpr3_vgpr4, implicit $exec { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: $vgpr6 = V_ADD_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec ; GCN: $vgpr5 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec $vgpr0, $vgpr1 = BUNDLE undef $vgpr3_vgpr4, implicit $exec { - $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + $vgpr0 = 
GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec } $vgpr5 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec $vgpr6 = V_ADD_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec @@ -32,13 +32,13 @@ ; GCN: $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $mode, implicit $exec ; GCN: $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $mode, implicit $exec ; GCN: BUNDLE killed $vgpr0, killed $vgpr1, undef $vgpr3_vgpr4, implicit $exec { - ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr1, 0, 0, 0, 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr0, 4, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr1, 0, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, killed $vgpr0, 4, 0, 0, 0, 0, implicit $exec ; GCN: } $vgpr0 = V_ADD_F32_e32 undef $vgpr5, undef $vgpr5, implicit $mode, implicit $exec $vgpr1 = V_ADD_F32_e32 undef $vgpr6, undef $vgpr6, implicit $mode, implicit $exec BUNDLE $vgpr0, $vgpr1, undef $vgpr3_vgpr4, implicit $exec { - GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec } ... 
Index: llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll @@ -0,0 +1,743 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7,UNPACKED-TID %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=-xnack -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,PACKED-TID %s + +; GCN-LABEL: {{^}}use_workitem_id_x: +; GCN: s_waitcnt +; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0 +; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_x() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_y: +; GCN: s_waitcnt +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10 +; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_y() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_z: +; GCN: s_waitcnt +; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 +; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_z() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_xy: +; GCN: s_waitcnt +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, 
[[IDY]] +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_xy() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_xyz: +; GCN: s_waitcnt +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_xyz() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + %val2 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + store volatile i32 %val2, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_xz: +; GCN: s_waitcnt +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_xz() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + %val1 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_yz: +; GCN: s_waitcnt +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10 +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] +; GCN-DAG: 
{{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_yz() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.y() + %val1 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: + +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 + +; GCN: .amdhsa_system_vgpr_workitem_id 0 +define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 { + call void @use_workitem_id_x() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: + +; GCN-NOT: v0 +; GCN-NOT: v1 +; UNPACKED-TID: v_lshlrev_b32_e32 v0, 10, v1 +; UNPACKED-TID-NOT: v0 +; UNPACKED-TID-NOT: v1 +; GCN: s_swappc_b64 + +; GCN: .amdhsa_system_vgpr_workitem_id 1 +define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { + call void @use_workitem_id_y() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z: + +; GCN-NOT: v0 +; GCN-NOT: v2 +; UNPACKED-TID: v_lshlrev_b32_e32 v0, 20, v2 +; UNPACKED-TID-NOT: v0 +; UNPACKED-TID-NOT: v1 +; GCN: s_swappc_b64 + +; GCN: .amdhsa_system_vgpr_workitem_id 2 +define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { + call void @use_workitem_id_z() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy: +; UNPACKED-TID-NOT: v0 +; UNPACKED-TID-NOT: v1 +; UNPACKED-TID: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDY]] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 { + call void @use_workitem_id_xy() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz: +; UNPACKED-TID-NOT: v0 +; UNPACKED-TID-NOT: v2 +; UNPACKED-TID: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; UNPACKED-TID: v_or_b32_e32 v0, v0, [[IDZ]] +; GCN-NOT: v0 +; GCN-NOT: v2 +; GCN: s_swappc_b64 +define amdgpu_kernel 
void @kern_indirect_use_workitem_id_xz() #1 { + call void @use_workitem_id_xz() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz: +; UNPACKED-TID-NOT: v1 +; UNPACKED-TID-NOT: v2 +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; UNPACKED-TID: v_or_b32_e32 v0, [[IDY]], [[IDZ]] +; GCN-NOT: v1 +; GCN-NOT: v2 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 { + call void @use_workitem_id_yz() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz: +; UNPACKED-TID-NOT: v0 +; UNPACKED-TID-NOT: v1 +; UNPACKED-TID-NOT: v2 +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1 +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2 +; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDY]] +; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, [[IDZ]] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN-NOT: v2 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 { + call void @use_workitem_id_xyz() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define void @func_indirect_use_workitem_id_x() #1 { + call void @use_workitem_id_x() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workitem_id_y: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define void @func_indirect_use_workitem_id_y() #1 { + call void @use_workitem_id_y() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workitem_id_z: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define void @func_indirect_use_workitem_id_z() #1 { + call void @use_workitem_id_z() + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: +; GCN: s_waitcnt +; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] +define void @other_arg_use_workitem_id_x(i32 
%arg0) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: +; GCN: s_waitcnt +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] +define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: +; GCN: s_waitcnt +; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]] +define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: + +; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v0, 0x22b +; GCN: s_swappc_b64 + +; GCN: .amdhsa_system_vgpr_workitem_id 0 +define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { + call void @other_arg_use_workitem_id_x(i32 555) + ret void +} + + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: + +; UNPACKED-TID: v_lshlrev_b32_e32 v1, 10, v1 +; PACKED-TID: v_mov_b32_e32 v1, v0 +; GCN-NOT: v1 +; GCN: v_mov_b32_e32 v0, 0x22b +; GCN-NOT: v1 +; GCN: s_swappc_b64 +; GCN-NOT: v0 + +; GCN: .amdhsa_system_vgpr_workitem_id 1 +define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { + call void @other_arg_use_workitem_id_y(i32 555) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: + +; GCN-DAG: v_mov_b32_e32 v0, 0x22b 
+; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 20, v2 +; PACKED-TID-DAG: v_mov_b32_e32 v1, v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 + +; GCN: .amdhsa_system_vgpr_workitem_id 2 +define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { + call void @other_arg_use_workitem_id_z(i32 555) + ret void +} + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: +; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GCN: v_and_b32_e32 v32, 0x3ff, v32 +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 +; GCN: s_setpc_b64 +define void @too_many_args_use_workitem_id_x( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, 
i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + store volatile i32 %arg31, i32 addrspace(1)* undef + + ret void +} + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: + +; GCN: s_mov_b32 s32, 0 +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} +; GCN: s_swappc_b64 + +; GCN: .amdhsa_system_vgpr_workitem_id 0 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { + call void @too_many_args_use_workitem_id_x( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320) + ret void +} + +; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: +; GCN: s_mov_b32 s33, s32 +; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} +; GCN: s_swappc_b64 +define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { + store volatile i32 %arg0, i32 addrspace(1)* undef + call void @too_many_args_use_workitem_id_x( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, 
i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320) + ret void +} + +; Requires loading and storing to stack slot. +; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: +; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_load_dword v32, off, s[0:3], s33{{$}} + +; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} + +; GCN: s_swappc_b64 + +; GCN: s_sub_u32 s32, s32, 0x400{{$}} +; GCN: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN: s_setpc_b64 +define void @too_many_args_call_too_many_args_use_workitem_id_x( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 { + call void @too_many_args_use_workitem_id_x( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) + ret void +} + +; stack layout: +; frame[0] = byval arg32 +; frame[1] = stack passed workitem ID x +; frame[2] = VGPR spill slot + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v32 +; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 +; GFX7: buffer_load_dword v0, off, s[0:3], s32 glc{{$}} +; GFX90A: buffer_load_dword v0, off, s[0:3], s32 glc scc{{$}} +; GCN: s_setpc_b64 
+define void @too_many_args_use_workitem_id_x_byval( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32 addrspace(5)* byval(i32) %arg32) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* 
undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + store volatile i32 %arg31, i32 addrspace(1)* undef + %private = load volatile i32, i32 addrspace(5)* %arg32 + ret void +} + +; sp[0] = byval +; sp[1] = ?? +; sp[2] = stack passed workitem ID x + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN-DAG: s_movk_i32 s32, 0x400 + +; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 + +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} +; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], +; GCN: s_swappc_b64 + +; GCN: .amdhsa_system_vgpr_workitem_id 0 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 999, i32 addrspace(5)* %alloca + call void @too_many_args_use_workitem_id_x_byval( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320, + i32 addrspace(5)* %alloca) + ret void +} + +; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GFX7: buffer_store_dword [[K]], off, s[0:3], s33{{$}} +; GFX90A: buffer_store_dword [[K]], off, s[0:3], s33 scc{{$}} +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} +; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], +; GCN: 
s_swappc_b64 +define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { + %alloca = alloca i32, align 4, addrspace(5) + store volatile i32 999, i32 addrspace(5)* %alloca + call void @too_many_args_use_workitem_id_x_byval( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320, + i32 addrspace(5)* %alloca) + ret void +} + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: +; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}} +; GCN: v_and_b32_e32 v33, 0x3ff, v32 +; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33 +; GFX7: v_bfe_u32 v33, v32, 10, 10 +; GFX90A: v_bfe_u32 v34, v32, 10, 10 +; GCN: v_bfe_u32 v32, v32, 20, 10 +; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v33{{$}} +; GFX7: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32{{$}} +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v33, off scc{{$}} +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v34, off scc{{$}} +; GFX90A: global_store_dword v{{\[[0-9]+:[0-9]+]}}, v32, off scc{{$}} + +; GFX7-COUNT-32: flat_store_dword v{{\[[0-9]+:[0-9]+]}} +; GFX90A-COUNT-32: global_store_dword v{{\[[0-9]+:[0-9]+]}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @too_many_args_use_workitem_id_xyz( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val0, i32 addrspace(1)* undef + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val1, i32 
addrspace(1)* undef + %val2 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val2, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + store volatile i32 %arg31, i32 addrspace(1)* undef + + ret void +} + +; frame[0] = ID { Z, Y, X } + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: + +; GCN-DAG: s_mov_b32 s32, 0 + +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 
v1, 10, v1 +; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v1 +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2 +; UNPACKED-TID-DAG: v_or_b32_e32 v0, v0, v2 +; PACKED-TID-NOT: v0 +; PACKED-TID-NOT: v1 +; PACKED-TID-NOT: v2 +; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}} +; GCN: s_swappc_b64 + +; GCN: .amdhsa_system_vgpr_workitem_id 2 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { + call void @too_many_args_use_workitem_id_xyz( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320) + ret void +} + +; workitem ID X in register, yz on stack +; v31 = workitem ID X +; frame[0] = workitem { Z, Y, X } + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz: +; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31 +; GCN-DAG: {{flat|global}}_store_dword v[0:1], [[IDX]] +; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]] +; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 +; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]] + +; GCN-COUNT-31: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}} +; GCN-NEXT: s_waitcnt +; GCN: s_setpc_b64 +; GCN: ScratchSize: 0 +define void @too_many_args_use_workitem_id_x_stack_yz( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30) #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val0, i32 addrspace(1)* undef + %val1 = call i32 
@llvm.amdgcn.workitem.id.y() + store volatile i32 %val1, i32 addrspace(1)* undef + %val2 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val2, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + + ret void +} + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: + +; GCN-NOT: v0 +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v1, 10, v1 +; UNPACKED-TID-DAG: 
v_or_b32_e32 v0, v0, v1 +; UNPACKED-TID-DAG: v_lshlrev_b32_e32 v2, 20, v2 +; UNPACKED-TID-DAG: v_or_b32_e32 v31, v0, v2 +; PACKED-TID: v_mov_b32_e32 v31, v0 + +; GCN: s_mov_b32 s32, 0 +; GCN: s_swappc_b64 + +; GCN: .amdhsa_system_vgpr_workitem_id 2 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { + call void @too_many_args_use_workitem_id_x_stack_yz( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 + +attributes #0 = { nounwind readnone speculatable "amdgpu-flat-work-group-size"="1,512" } +attributes #1 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" } Index: llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir +++ llvm/test/CodeGen/AMDGPU/clamp-omod-special-case.mir @@ -55,10 +55,10 @@ %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -117,10 +117,10 @@ %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit $mode, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -180,10 +180,10 @@ %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit $mode, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -245,10 +245,10 @@ %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit $mode, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -322,10 +322,10 @@ %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit $mode, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -387,10 +387,10 @@ %26 = V_LSHL_B64_e64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $mode, implicit $exec %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit $mode, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir +++ llvm/test/CodeGen/AMDGPU/cluster-flat-loads-postra.mir @@ -17,15 +17,15 @@ $vgpr0_vgpr1 = IMPLICIT_DEF $vgpr4_vgpr5 = IMPLICIT_DEF - $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) - $vgpr4 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + $vgpr4 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) $vgpr2 = IMPLICIT_DEF $vgpr3 = IMPLICIT_DEF $vgpr6 = IMPLICIT_DEF $vgpr0 = V_ADD_CO_U32_e32 16, $vgpr2, implicit-def $vcc, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr3, killed $vgpr6, implicit-def dead $vcc, implicit $vcc, implicit $exec - FLAT_STORE_DWORD $vgpr2_vgpr3, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) - FLAT_STORE_DWORD $vgpr0_vgpr1, killed $vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD $vgpr2_vgpr3, killed $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD 
$vgpr0_vgpr1, killed $vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir +++ llvm/test/CodeGen/AMDGPU/cluster-flat-loads.mir @@ -14,7 +14,7 @@ body: | bb.0: %0 = IMPLICIT_DEF - %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec - %3 = FLAT_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ... Index: llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/coalesce-vgpr-alignment.ll @@ -0,0 +1,38 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; Check that register coalescer does not create an odd subreg when register tuples +; must be aligned. 
+ +; GCN-LABEL: {{^}}test_odd_int4: +; GCN: global_load_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v{{[0-9]+}}, s[{{[0-9:]+}}] +; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[HI:[13579]]], v{{[0-9]+}} +; GCN: global_store_dwordx2 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]], s[{{[0-9:]+}}] + +define amdgpu_kernel void @test_odd_int4(<4 x i32> addrspace(1)* %arg, <2 x i32> addrspace(1)* %arg1) { +bb: + %lid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %lid + %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1, align 16 + %shuffle = shufflevector <4 x i32> %load, <4 x i32> undef, <2 x i32> + %gep2 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %arg1, i32 %lid + store <2 x i32> %shuffle, <2 x i32> addrspace(1)* %gep2, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_vector_creation: +; GCN: global_load_dwordx2 v[{{[0-9]*[02468]}}:{{[0-9]+}}], +; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[LO:[02468]]], v{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v{{[0-9]*}}[[HI:[13579]]], v{{[0-9]+}} +; GCN: global_store_dwordx4 v[{{[0-9]*[02468]:[0-9]*[13579]}}], v[{{[0-9]*[02468]:[0-9]*[13579]}}] +define amdgpu_kernel void @test_vector_creation() { +entry: + %tmp231 = load <4 x i16>, <4 x i16> addrspace(1)* undef, align 2 + %vext466 = shufflevector <4 x i16> %tmp231, <4 x i16> undef, <8 x i32> + %vecinit467 = shufflevector <8 x i16> undef, <8 x i16> %vext466, <8 x i32> + %vecinit471 = shufflevector <8 x i16> %vecinit467, <8 x i16> undef, <8 x i32> + store <8 x i16> %vecinit471, <8 x i16> addrspace(1)* undef, align 16 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir +++ llvm/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir @@ -30,7 +30,7 @@ 
%14:vgpr_32 = V_AND_B32_e32 1, %13, implicit $exec %15:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %14, implicit $exec %16:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %15, implicit $exec - BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4) S_ENDPGM 0 bb.2: @@ -78,7 +78,7 @@ bb.8: successors: %bb.10 - %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) + %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) %34:sreg_64_xexec = V_CMP_NE_U32_e64 0, %31, implicit $exec %35:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %34, implicit $exec %28:vgpr_32 = COPY %35 Index: llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir +++ llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir @@ -83,7 +83,7 @@ bb.9: successors: %bb.10(0x80000000) - %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) %21:sreg_64 = V_CMP_NE_U32_e64 target-flags(amdgpu-gotprel) 0, killed %19.sub0, implicit $exec %22:sreg_64 = COPY $exec, implicit-def $exec 
%23:sreg_64 = S_AND_B64 %22, %21, implicit-def dead $scc @@ -125,7 +125,7 @@ %27.sub5:sgpr_256 = COPY %26 %27.sub6:sgpr_256 = COPY %26 %27.sub7:sgpr_256 = COPY killed %26 - %28:vgpr_32 = IMAGE_LOAD_V1_V4 killed %25, killed %27, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) + %28:vgpr_32 = IMAGE_LOAD_V1_V4 killed %25, killed %27, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) %29:vgpr_32 = nofpexcept V_ADD_F32_e32 0, killed %28, implicit $mode, implicit $exec $m0 = S_MOV_B32 -1 DS_WRITE_B32 undef %30:vgpr_32, killed %29, 0, 0, implicit $m0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`, addrspace 3) Index: llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir +++ llvm/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir @@ -68,7 +68,7 @@ %23:vreg_128 = COPY killed %17 %24:sreg_64 = COPY killed %16 %25:vgpr_32 = V_OR_B32_e32 %22, %11, implicit $exec - %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) %28:vgpr_32 = V_LSHRREV_B32_e32 30, killed %26.sub0, implicit $exec %29:vreg_128 = COPY killed %21 %29.sub0:vreg_128 = COPY %1 @@ -257,7 +257,7 @@ %109.sub5:sgpr_256 = COPY %108 %109.sub6:sgpr_256 = COPY %108 %109.sub7:sgpr_256 = COPY killed %108 - %110:vgpr_32 = IMAGE_SAMPLE_V1_V2 killed %107, killed %109, undef %111:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) + %110:vgpr_32 = IMAGE_SAMPLE_V1_V2 
killed %107, killed %109, undef %111:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) %112:vgpr_32 = nofpexcept V_MUL_F32_e32 0, killed %110, implicit $mode, implicit $exec %113:vgpr_32 = nofpexcept V_MUL_F32_e32 0, killed %112, implicit $mode, implicit $exec %114:vgpr_32 = nofpexcept V_MAD_F32_e64 0, killed %113, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir +++ llvm/test/CodeGen/AMDGPU/coalescer-subreg-join.mir @@ -61,7 +61,7 @@ %11.sub6 = COPY %1 %11.sub7 = COPY %1 %11.sub8 = COPY %1 - dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4) + dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4) %20.sub1 = COPY %2 %20.sub2 = COPY %2 %20.sub3 = COPY %2 @@ -70,6 +70,6 @@ %20.sub6 = COPY %2 %20.sub7 = COPY %2 %20.sub8 = COPY %2 - dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4) + dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 4) ... 
Index: llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir +++ llvm/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir @@ -11,7 +11,7 @@ # # GCN-LABEL: bb.6: # GCN: successors: %bb.7(0x{{[0-9]+}}), %bb.18(0x{{[0-9]+}}) -# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # --- | @@ -69,7 +69,7 @@ %10:sreg_64 = COPY killed %5 undef %11.sub2:sgpr_128 = COPY %4 %11.sub3:sgpr_128 = COPY %3 - %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec undef %13.sub1:vreg_128 = COPY %9.sub1 %13.sub2:vreg_128 = COPY %9.sub2 %14:sreg_64 = nofpexcept V_CMP_GT_F32_e64 0, target-flags(amdgpu-rel32-lo) 0, 0, killed %12.sub3, 0, implicit $mode, implicit $exec @@ -161,7 +161,7 @@ bb.18: successors: %bb.7(0x80000000) dead %59:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %9.sub2, 0, undef %60:vgpr_32, 0, undef %61:vgpr_32, 0, 0, implicit $mode, implicit $exec - dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sgpr_128, undef %65:sreg_32, 0, 0, 0, 0, 0, 0, implicit $exec + dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sgpr_128, undef %65:sreg_32, 0, 0, 0, 0, 0, 0, 0, implicit $exec undef %66.sub1:vreg_128 = COPY %13.sub1 %66.sub2:vreg_128 = COPY %13.sub2 %67:sreg_64 = nofpexcept V_CMP_NGT_F32_e64 0, 0, 0, undef %68:vgpr_32, 0, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir +++ 
llvm/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir @@ -47,7 +47,7 @@ %4.sub5:sgpr_256 = COPY %1 %4.sub6:sgpr_256 = COPY %1 %4.sub7:sgpr_256 = COPY killed %1 - %5:vgpr_32 = IMAGE_LOAD_V1_V4 killed %3, killed %4, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) + %5:vgpr_32 = IMAGE_LOAD_V1_V4 killed %3, killed %4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) %6:vgpr_32 = nofpexcept V_MAD_F32_e64 0, killed %5, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec %7:vgpr_32 = nofpexcept V_RCP_F32_e32 killed %6, implicit $mode, implicit $exec %8:vgpr_32 = nofpexcept V_MUL_F32_e32 0, killed %7, implicit $mode, implicit $exec @@ -148,7 +148,7 @@ %43:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %44:sgpr_128, 12, 0, 0 :: (dereferenceable invariant load 4) %45:vgpr_32 = V_MUL_LO_I32_e64 killed %42, killed %43, implicit $exec %46:vgpr_32 = V_LSHLREV_B32_e32 2, killed %45, implicit $exec - %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) + %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) %49:sreg_64 = V_CMP_NE_U32_e64 0, killed %47, implicit $exec %50:sreg_64 = COPY $exec, implicit-def $exec %51:sreg_64 = S_AND_B64 %50, %49, implicit-def dead $scc Index: llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir +++ llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir @@ -42,7 +42,7 @@ ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 
%6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -54,7 +54,7 @@ ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc @@ -91,7 +91,7 @@ %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -103,7 +103,7 @@ %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc Index: llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir 
=================================================================== --- llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -37,7 +37,7 @@ %8:sreg_32_xm0 = S_MOV_B32 9999 %9:sreg_32_xm0 = S_AND_B32 killed %7, killed %8, implicit-def dead $scc %10:vgpr_32 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -85,19 +85,19 @@ %16:vgpr_32 = V_MOV_B32_e32 63, implicit $exec %9:vgpr_32 = V_AND_B32_e64 %8, %6, implicit $exec - FLAT_STORE_DWORD %19, %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %10:vgpr_32 = V_AND_B32_e64 %6, %8, implicit $exec - FLAT_STORE_DWORD %19, %10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %11:vgpr_32 = V_AND_B32_e32 %8, %6, implicit $exec - FLAT_STORE_DWORD %19, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %12:vgpr_32 = V_AND_B32_e64 %8, %8, implicit $exec - FLAT_STORE_DWORD %19, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %13:vgpr_32 = V_AND_B32_e64 %16, %16, implicit $exec - FLAT_STORE_DWORD %19, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 @@ -116,7 +116,6 @@ bb.0: liveins: $sgpr0_sgpr1 - %0:sgpr_64 = COPY $sgpr0_sgpr1 %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0, 0 %5:sreg_32_xm0_xexec = S_MOV_B32 1 @@ -127,7 +126,7 @@ %10:sgpr_128 = REG_SEQUENCE killed %7, %subreg.hi16, killed %6, %subreg.lo16, killed %9, %subreg.sub0, killed %8, %subreg.sub0_sub1 %12:sreg_32_xm0 = S_LSHL_B32 killed %5, 12, 
implicit-def dead $scc %13:vgpr_32 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -154,7 +153,7 @@ %8:sgpr_128 = REG_SEQUENCE killed %5, %subreg.hi16, killed %4, %subreg.lo16, killed %7, %subreg.sub0, killed %6, %subreg.sub0_sub1 %10:sreg_32_xm0 = S_ASHR_I32 killed %3, 12, implicit-def dead $scc %11:vgpr_32 = COPY %10 - BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -222,34 +221,34 @@ %32:vgpr_32 = V_MOV_B32_e32 2, implicit $exec %11:vgpr_32 = V_ASHRREV_I32_e64 8, %10, implicit $exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %12:vgpr_32 = V_ASHRREV_I32_e64 %8, %10, implicit $exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %13:vgpr_32 = V_ASHR_I32_e64 %7, 3, implicit $exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %14:vgpr_32 = V_ASHR_I32_e64 7, %29, implicit $exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %15:vgpr_32 = V_ASHR_I32_e64 %27, %24, implicit $exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %22:vgpr_32 = V_ASHR_I32_e64 %6, 4, implicit $exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr 
%23:vgpr_32 = V_ASHR_I32_e64 %6, %30, implicit $exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %25:vgpr_32 = V_ASHR_I32_e32 %31, %31, implicit $exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %26:vgpr_32 = V_ASHRREV_I32_e32 11, %10, implicit $exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %28:vgpr_32 = V_ASHR_I32_e32 %27, %32, implicit $exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 @@ -277,7 +276,7 @@ %8:sgpr_128 = REG_SEQUENCE killed %5, %subreg.hi16, killed %4, %subreg.lo16, killed %7, %subreg.sub0, killed %6, %subreg.sub0_sub1 %10:sreg_32_xm0 = S_LSHR_B32 killed %3, 12, implicit-def dead $scc %11:vgpr_32 = COPY %10 - BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %11, killed %8, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -346,34 +345,34 @@ %32:vgpr_32 = V_MOV_B32_e32 2, implicit $exec %11:vgpr_32 = V_LSHRREV_B32_e64 8, %10, implicit $exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %12:vgpr_32 = V_LSHRREV_B32_e64 %8, %10, implicit $exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %13:vgpr_32 = V_LSHR_B32_e64 %7, 3, implicit $exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %14:vgpr_32 = V_LSHR_B32_e64 7, %29, implicit $exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %15:vgpr_32 = V_LSHR_B32_e64 %27, %24, implicit $exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %22:vgpr_32 = V_LSHR_B32_e64 %6, 4, implicit $exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %23:vgpr_32 = V_LSHR_B32_e64 %6, %30, implicit $exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %25:vgpr_32 = V_LSHR_B32_e32 %31, %31, implicit $exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %26:vgpr_32 = V_LSHRREV_B32_e32 11, %10, implicit $exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %28:vgpr_32 = V_LSHR_B32_e32 %27, %32, implicit $exec 
- FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... @@ -391,7 +390,7 @@ bb.0: %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %2:vgpr_32 = V_XOR_B32_e64 killed %0, undef %1:vgpr_32, implicit $exec - FLAT_STORE_DWORD undef %3:vreg_64, %2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD undef %3:vreg_64, %2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... @@ -445,7 +444,7 @@ %8:sreg_32_xm0 = S_MOV_B32 9999 %9:sreg_32_xm0 = S_ANDN2_B32 killed %7, killed %8, implicit-def dead $scc %10:vgpr_32 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -478,7 +477,7 @@ %8:sreg_32_xm0 = S_MOV_B32 9999 %9:sreg_32_xm0 = S_OR_B32 killed %7, killed %8, implicit-def dead $scc %10:vgpr_32 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -530,15 +529,15 @@ %8:sreg_32_xm0 = S_MOV_B32 1234567 %16:vgpr_32 = V_MOV_B32_e32 63, implicit $exec %9:vgpr_32 = V_OR_B32_e64 %8, %6, implicit $exec - FLAT_STORE_DWORD %19, %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %10:vgpr_32 = V_OR_B32_e64 %6, %8, implicit $exec - FLAT_STORE_DWORD %19, %10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %11:vgpr_32 = V_OR_B32_e32 %8, %6, implicit $exec - FLAT_STORE_DWORD %19, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %12:vgpr_32 = V_OR_B32_e64 %8, %8, implicit $exec - FLAT_STORE_DWORD %19, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %13:vgpr_32 = V_OR_B32_e64 %16, %16, implicit $exec - FLAT_STORE_DWORD %19, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %19, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... @@ -571,7 +570,7 @@ %8:sreg_32_xm0 = S_MOV_B32 9999 %9:sreg_32_xm0 = S_ORN2_B32 killed %7, killed %8, implicit-def dead $scc %10:vgpr_32 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -604,7 +603,7 @@ %8:sreg_32_xm0 = S_MOV_B32 9999 %9:sreg_32_xm0 = S_NAND_B32 killed %7, killed %8, implicit-def dead $scc %10:vgpr_32 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -637,7 +636,7 @@ %8:sreg_32_xm0 = S_MOV_B32 9999 %9:sreg_32_xm0 = S_NOR_B32 killed %7, killed %8, implicit-def dead $scc %10:vgpr_32 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -670,7 +669,7 @@ %8:sreg_32_xm0 = S_MOV_B32 9999 %9:sreg_32_xm0 = S_XNOR_B32 killed %7, killed %8, implicit-def dead $scc %10:vgpr_32 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -739,25 +738,25 @@ %7:sreg_32_xm0 = S_MOV_B32 1 %27:sreg_32_xm0 = S_MOV_B32 -4 %11:vgpr_32 = V_LSHLREV_B32_e64 12, %10, implicit $exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %12:vgpr_32 = V_LSHLREV_B32_e64 %7, 12, implicit $exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %13:vgpr_32 = V_LSHL_B32_e64 %7, 12, implicit $exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %14:vgpr_32 = V_LSHL_B32_e64 12, %7, implicit $exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %14, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %15:vgpr_32 = V_LSHL_B32_e64 12, %24, implicit $exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %22:vgpr_32 = V_LSHL_B32_e64 %6, 12, implicit $exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %22, 0, 0, 0, 0, 0, implicit $exec, implicit 
$flat_scr %23:vgpr_32 = V_LSHL_B32_e64 %6, 32, implicit $exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %23, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %25:vgpr_32 = V_LSHL_B32_e32 %6, %6, implicit $exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %25, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %26:vgpr_32 = V_LSHLREV_B32_e32 11, %24, implicit $exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %26, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %28:vgpr_32 = V_LSHL_B32_e32 %27, %6, implicit $exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD %20, %28, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir @@ -0,0 +1,160 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX90A %s + +--- +name: copy_v64_to_v64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr2_vgpr3 + ; GFX908-LABEL: name: copy_v64_to_v64 + ; GFX908: liveins: $vgpr2_vgpr3 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A-LABEL: name: copy_v64_to_v64 + ; GFX90A: liveins: $vgpr2_vgpr3 + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit 
$exec + $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec +... + +--- +name: copy_s64_to_v64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr2_sgpr3 + ; GFX908-LABEL: name: copy_s64_to_v64 + ; GFX908: liveins: $sgpr2_sgpr3 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX90A-LABEL: name: copy_s64_to_v64 + ; GFX90A: liveins: $sgpr2_sgpr3 + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + $vgpr0_vgpr1 = COPY killed $sgpr2_sgpr3, implicit $exec +... + +--- +name: copy_a64_to_v64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr2_agpr3 + ; GFX908-LABEL: name: copy_a64_to_v64 + ; GFX908: liveins: $agpr2_agpr3 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX90A-LABEL: name: copy_a64_to_v64 + ; GFX90A: liveins: $agpr2_agpr3 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + $vgpr0_vgpr1 = COPY killed $agpr2_agpr3, implicit $exec +... 
+ +--- +name: copy_v128_to_v128_fwd +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX908-LABEL: name: copy_v128_to_v128_fwd + ; GFX908: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX90A-LABEL: name: copy_v128_to_v128_fwd + ; GFX90A: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec +... 
+ +--- +name: copy_v128_to_v128_back +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908-LABEL: name: copy_v128_to_v128_back + ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX90A-LABEL: name: copy_v128_to_v128_back + ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX90A: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5 + ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + $vgpr2_vgpr3_vgpr4_vgpr5 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec +... 
+ +--- +name: copy_v96_to_v96 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr4_vgpr5_vgpr6 + ; GFX908-LABEL: name: copy_v96_to_v96 + ; GFX908: liveins: $vgpr4_vgpr5_vgpr6 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX90A-LABEL: name: copy_v96_to_v96 + ; GFX90A: liveins: $vgpr4_vgpr5_vgpr6 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr4_vgpr5_vgpr6, implicit $exec +... + +--- +name: copy_v64_to_v64_undef_sub0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr3 + ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub0 + ; GFX908: liveins: $vgpr3 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0 + ; GFX90A: liveins: $vgpr3 + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec +... 
+ +--- +name: copy_v64_to_v64_undef_sub1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr2 + ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub1 + ; GFX908: liveins: $vgpr2 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1 + ; GFX90A: liveins: $vgpr2 + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec +... + +--- +name: copy_s128_to_v128_killed +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908-LABEL: name: copy_s128_to_v128_killed + ; GFX908: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A-LABEL: name: copy_s128_to_v128_killed + ; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7 +... 
Index: llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir +++ llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir @@ -37,7 +37,7 @@ ; GCN: S_BRANCH %bb.3 ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) - ; GCN: dead %16:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]].sub3, undef %17:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + ; GCN: dead %16:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]].sub3, undef %17:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) ; GCN: dead %18:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; GCN: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc ; GCN: dead %20:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -89,7 +89,7 @@ S_BRANCH %bb.3 bb.3: - dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) dead %60:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec %36:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc dead %67:vgpr_32 = V_MOV_B32_e32 0, implicit $exec Index: llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir +++ llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir @@ -66,9 +66,9 @@ ; CHECK: dead %16:vgpr_32 = COPY %11.sub0 ; CHECK: undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec ; CHECK: dead undef %17.sub1:vreg_64, 
dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, %18, 0, implicit $exec - ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) ; CHECK: dead %20:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec - ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) + ; CHECK: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x80000000) ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] @@ -79,8 +79,8 @@ ; CHECK: bb.4: ; CHECK: successors: %bb.5(0x80000000) ; CHECK: dead %21:sreg_64 = COPY $exec - ; CHECK: dead %22:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) - ; CHECK: DBG_VALUE %22, + ; CHECK: dead %22:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY1]], 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + ; CHECK: DBG_VALUE %22, $noreg, <0x{{[0-9a-f]+}}>, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !DILocation(line: 0, scope: <0x{{[0-9a-f]+}}>) ; CHECK: bb.5: ; CHECK: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK: S_CBRANCH_EXECZ %bb.3, implicit $exec @@ -109,9 +109,9 @@ dead %16:vgpr_32 = COPY %11.sub0 undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 %6.sub0, %8.sub0, 0, implicit $exec dead %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 %6.sub1, %8.sub1, %18, 0, implicit $exec - %6:vreg_64 = GLOBAL_LOAD_DWORDX2 %3, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) + %6:vreg_64 = GLOBAL_LOAD_DWORDX2 %3, 0, 0, 0, 0, 0, implicit $exec :: (load 8, addrspace 1) dead %20:sreg_64 = V_CMP_GT_I32_e64 4, %9, implicit $exec - GLOBAL_STORE_DWORDX2 %0, %10, 288, 0, 0, 0, implicit $exec :: (store 8, 
addrspace 1) + GLOBAL_STORE_DWORDX2 %0, %10, 288, 0, 0, 0, 0, implicit $exec :: (store 8, addrspace 1) bb.2: %5:vgpr_32 = COPY %13 @@ -122,7 +122,7 @@ bb.4: dead %21:sreg_64 = COPY $exec - %22:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + %22:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) DBG_VALUE %22, $noreg, !16, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !21 bb.5: Index: llvm/test/CodeGen/AMDGPU/dead-lane.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/dead-lane.mir +++ llvm/test/CodeGen/AMDGPU/dead-lane.mir @@ -12,7 +12,7 @@ %1:vgpr_32 = nofpexcept V_MAC_F32_e32 undef %0:vgpr_32, undef %0:vgpr_32, undef %0:vgpr_32, implicit $mode, implicit $exec %2:vgpr_32 = nofpexcept V_MAC_F32_e32 undef %0:vgpr_32, undef %0:vgpr_32, undef %0:vgpr_32, implicit $mode, implicit $exec %3:vreg_64 = REG_SEQUENCE %1:vgpr_32, %subreg.sub0, %2:vgpr_32, %subreg.sub1 - FLAT_STORE_DWORD undef %4:vreg_64, %3.sub0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD undef %4:vreg_64, %3.sub0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/dead_copy.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/dead_copy.mir +++ llvm/test/CodeGen/AMDGPU/dead_copy.mir @@ -23,5 +23,5 @@ $vgpr10 = COPY killed $sgpr14, implicit $exec $vgpr11 = COPY killed $sgpr15, implicit $exec - FLAT_STORE_DWORDX4 $vgpr10_vgpr11, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORDX4 $vgpr10_vgpr11, $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... 
Index: llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir +++ llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir @@ -71,7 +71,7 @@ ; CHECK: dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec ; CHECK: dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec ; CHECK: dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec - ; CHECK: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, 0, 0, implicit $exec + ; CHECK: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, 0, 0, 0, implicit $exec ; CHECK: S_ENDPGM 0 bb.0: successors: %bb.1 @@ -129,7 +129,7 @@ %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, %4, 0, %1, 0, 0, implicit $mode, implicit $exec %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, %5, 0, %2, 0, 0, implicit $mode, implicit $exec %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %25, 0, %6, 0, %3, 0, 0, implicit $mode, implicit $exec - GLOBAL_STORE_DWORD %0, %11, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %11, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/dpp64_combine.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/dpp64_combine.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10 + +; GCN-LABEL: {{^}}dpp64_ceil: +; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]], +; DPP64: v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}} +; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}} +define amdgpu_kernel void @dpp64_ceil(i64 addrspace(1)* %arg, i64 %in1) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id + %load = load i64, i64 addrspace(1)* %gep + %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0 + %tmp1 = bitcast i64 %tmp0 to double + %round = tail call double @llvm.ceil.f64(double %tmp1) + %tmp2 = bitcast double %round to i64 + store i64 %tmp2, i64 addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}dpp64_rcp: +; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]], +; DPP64: v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}} +; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}} +define amdgpu_kernel void @dpp64_rcp(i64 addrspace(1)* %arg, i64 %in1) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id + %load = load i64, i64 addrspace(1)* %gep + %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0 + %tmp1 = bitcast i64 %tmp0 to double + %rcp = call double @llvm.amdgcn.rcp.f64(double %tmp1) + %tmp2 = bitcast double %rcp 
to i64 + store i64 %tmp2, i64 addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}dpp64_rcp_unsupported_ctl: +; GCN-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}} +; GCN: v_rcp_f64_e32 +define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(i64 addrspace(1)* %arg, i64 %in1) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id + %load = load i64, i64 addrspace(1)* %gep + %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 15, i32 15, i1 1) #0 + %tmp1 = bitcast i64 %tmp0 to double + %rcp = fdiv fast double 1.0, %tmp1 + %tmp2 = bitcast double %rcp to i64 + store i64 %tmp2, i64 addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}dpp64_div: +; GCN: global_load_dwordx2 [[V:v\[[0-9:]+\]]], +; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}} +; GFX10-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:0{{$}} +; GCN: v_div_scale_f64 +; GCN: v_rcp_f64_e32 +define amdgpu_kernel void @dpp64_div(i64 addrspace(1)* %arg, i64 %in1) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %id + %load = load i64, i64 addrspace(1)* %gep + %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 337, i32 15, i32 15, i1 1) #0 + %tmp1 = bitcast i64 %tmp0 to double + %rcp = fdiv double 15.0, %tmp1 + %tmp2 = bitcast double %rcp to i64 + store i64 %tmp2, i64 addrspace(1)* %gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0 +declare double @llvm.ceil.f64(double) +declare double @llvm.amdgcn.rcp.f64(double) + +attributes #0 = { nounwind readnone convergent } Index: llvm/test/CodeGen/AMDGPU/dpp64_combine.mir 
=================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/dpp64_combine.mir @@ -0,0 +1,51 @@ +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=GCN + +--- +# GCN-LABEL: name: dpp64_old_impdef +# GCN: %3:vreg_64 = V_CEIL_F64_dpp %1, 0, %0, 337, 15, 15, 1, implicit $mode, implicit $exec +--- +name: dpp64_old_impdef +tracksRegLiveness: true +body: | + bb.0: + %0:vreg_64 = IMPLICIT_DEF + %1:vreg_64 = IMPLICIT_DEF + %2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 337, 15, 15, 1, implicit $exec + %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: dpp64_old_undef +# GCN: %3:vreg_64 = V_CEIL_F64_dpp undef %1:vreg_64, 0, undef %2:vreg_64, 337, 15, 15, 1, implicit $mode, implicit $exec +--- +name: dpp64_old_undef +tracksRegLiveness: true +body: | + bb.0: + %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 337, 15, 15, 1, implicit $exec + %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec +... + +# GCN-LABEL: name: dpp64_old_is_0 +# GCN: %3:vreg_64 = V_CEIL_F64_dpp %4, 0, undef %2:vreg_64, 337, 15, 15, 1, implicit $mode, implicit $exec +name: dpp64_old_is_0 +tracksRegLiveness: true +body: | + bb.0: + %1:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec + %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1, undef %0:vreg_64, 337, 15, 15, 1, implicit $exec + %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec +... 
+ +# DPP64 does not support all control values and must be split to become legal +# GCN-LABEL: name: dpp64_illegal_ctrl +# GCN: %4:vgpr_32 = V_MOV_B32_dpp undef %1.sub0:vreg_64, undef %2.sub0:vreg_64, 1, 15, 15, 1, implicit $exec +# GCN: %5:vgpr_32 = V_MOV_B32_dpp undef %1.sub1:vreg_64, undef %2.sub1:vreg_64, 1, 15, 15, 1, implicit $exec +# GCN: %0:vreg_64 = REG_SEQUENCE %4, %subreg.sub0, %5, %subreg.sub1 +# GCN: %3:vreg_64 = V_CEIL_F64_e32 %0, implicit $mode, implicit $exec +name: dpp64_illegal_ctrl +tracksRegLiveness: true +body: | + bb.0: + %2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec + %3:vreg_64 = V_CEIL_F64_e32 %2, implicit $mode, implicit $exec +... Index: llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -52,6 +52,7 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX906 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx908 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX908 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx909 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX909 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx90a < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX90A %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx90c < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX90C %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1010 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1011 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1011 %s @@ -109,6 +110,7 @@ ; GFX906: 
EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) ; GFX908: EF_AMDGPU_MACH_AMDGCN_GFX908 (0x30) ; GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31) +; GFX90A: EF_AMDGPU_MACH_AMDGCN_GFX90A (0x3F) ; GFX90C: EF_AMDGPU_MACH_AMDGCN_GFX90C (0x32) ; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) ; GFX1011: EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34) Index: llvm/test/CodeGen/AMDGPU/endpgm-dce.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/endpgm-dce.mir +++ llvm/test/CodeGen/AMDGPU/endpgm-dce.mir @@ -17,7 +17,7 @@ %0 = IMPLICIT_DEF %3 = IMPLICIT_DEF $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc - %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec %4 = S_ADD_U32 %3, 1, implicit-def $scc S_ENDPGM 0 @@ -25,7 +25,7 @@ --- # GCN-LABEL: name: load_without_memoperand # GCN: $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc -# GCN-NEXT: dead %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr +# GCN-NEXT: dead %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr # GCN-NEXT: S_ENDPGM 0 name: load_without_memoperand tracksRegLiveness: true @@ -41,7 +41,7 @@ %0 = IMPLICIT_DEF %3 = IMPLICIT_DEF $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc - %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec %4 = S_ADD_U32 %3, 1, implicit-def $scc S_ENDPGM 0 @@ -49,7 +49,7 @@ --- # GCN-LABEL: name: load_volatile # GCN: $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc -# GCN-NEXT: dead %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4) +# GCN-NEXT: 
dead %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4) # GCN-NEXT: S_ENDPGM 0 name: load_volatile tracksRegLiveness: true @@ -65,7 +65,7 @@ %0 = IMPLICIT_DEF %3 = IMPLICIT_DEF $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc - %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4) + %1 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4) %2 = V_ADD_F32_e64 0, killed %1, 0, 1, 0, 0, implicit $mode, implicit $exec %4 = S_ADD_U32 %3, 1, implicit-def $scc S_ENDPGM 0 @@ -73,7 +73,7 @@ --- # GCN-LABEL: name: store # GCN: $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc -# GCN-NEXT: FLAT_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) +# GCN-NEXT: FLAT_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) # GCN-NEXT: S_ENDPGM 0 name: store tracksRegLiveness: true @@ -86,7 +86,7 @@ %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF $sgpr0_sgpr1 = S_OR_B64 $exec, killed $vcc, implicit-def $scc - FLAT_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) S_ENDPGM 0 ... 
--- Index: llvm/test/CodeGen/AMDGPU/expand-si-indirect.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/expand-si-indirect.mir +++ llvm/test/CodeGen/AMDGPU/expand-si-indirect.mir @@ -53,49 +53,49 @@ %28:vgpr_32 = COPY %23.sub13 %29:vgpr_32 = COPY %23.sub12 %30:vreg_128 = REG_SEQUENCE killed %29, %subreg.sub0, killed %28, %subreg.sub1, killed %27, %subreg.sub2, killed %26, %subreg.sub3 - GLOBAL_STORE_DWORDX4_SADDR %1, killed %30, %2, 48, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %1, killed %30, %2, 48, 0, 0, 0, 0, implicit $exec %31:vgpr_32 = COPY %23.sub11 %32:vgpr_32 = COPY %23.sub10 %33:vgpr_32 = COPY %23.sub9 %34:vgpr_32 = COPY %23.sub8 %35:vreg_128 = REG_SEQUENCE killed %34, %subreg.sub0, killed %33, %subreg.sub1, killed %32, %subreg.sub2, killed %31, %subreg.sub3 - GLOBAL_STORE_DWORDX4_SADDR %1, killed %35, %2, 32, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %1, killed %35, %2, 32, 0, 0, 0, 0, implicit $exec %36:vgpr_32 = COPY %23.sub7 %37:vgpr_32 = COPY %23.sub6 %38:vgpr_32 = COPY %23.sub5 %39:vgpr_32 = COPY %23.sub4 %40:vreg_128 = REG_SEQUENCE killed %39, %subreg.sub0, killed %38, %subreg.sub1, killed %37, %subreg.sub2, killed %36, %subreg.sub3 - GLOBAL_STORE_DWORDX4_SADDR %1, killed %40, %2, 16, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %1, killed %40, %2, 16, 0, 0, 0, 0, implicit $exec %41:vgpr_32 = COPY %23.sub3 %42:vgpr_32 = COPY %23.sub2 %43:vgpr_32 = COPY %23.sub1 %44:vgpr_32 = COPY killed %23.sub0 %45:vreg_128 = REG_SEQUENCE killed %44, %subreg.sub0, killed %43, %subreg.sub1, killed %42, %subreg.sub2, killed %41, %subreg.sub3 - GLOBAL_STORE_DWORDX4_SADDR %1, killed %45, %2, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %1, killed %45, %2, 0, 0, 0, 0, 0, implicit $exec %46:vgpr_32 = COPY %25.sub15 %47:vgpr_32 = COPY %25.sub14 %48:vgpr_32 = COPY %25.sub13 %49:vgpr_32 = COPY %25.sub12 %50:vreg_128 = REG_SEQUENCE killed %49, %subreg.sub0, killed %48, %subreg.sub1, 
killed %47, %subreg.sub2, killed %46, %subreg.sub3 - GLOBAL_STORE_DWORDX4_SADDR %1, killed %50, %2, 112, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %1, killed %50, %2, 112, 0, 0, 0, 0, implicit $exec %51:vgpr_32 = COPY %25.sub11 %52:vgpr_32 = COPY %25.sub10 %53:vgpr_32 = COPY %25.sub9 %54:vgpr_32 = COPY %25.sub8 %55:vreg_128 = REG_SEQUENCE killed %54, %subreg.sub0, killed %53, %subreg.sub1, killed %52, %subreg.sub2, killed %51, %subreg.sub3 - GLOBAL_STORE_DWORDX4_SADDR %1, killed %55, %2, 96, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %1, killed %55, %2, 96, 0, 0, 0, 0, implicit $exec %56:vgpr_32 = COPY %25.sub7 %57:vgpr_32 = COPY %25.sub6 %58:vgpr_32 = COPY %25.sub5 %59:vgpr_32 = COPY %25.sub4 %60:vreg_128 = REG_SEQUENCE killed %59, %subreg.sub0, killed %58, %subreg.sub1, killed %57, %subreg.sub2, killed %56, %subreg.sub3 - GLOBAL_STORE_DWORDX4_SADDR %1, killed %60, %2, 80, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR %1, killed %60, %2, 80, 0, 0, 0, 0, implicit $exec %61:vgpr_32 = COPY %25.sub3 %62:vgpr_32 = COPY %25.sub2 %63:vgpr_32 = COPY %25.sub1 %64:vgpr_32 = COPY killed %25.sub0 %65:vreg_128 = REG_SEQUENCE killed %64, %subreg.sub0, killed %63, %subreg.sub1, killed %62, %subreg.sub2, killed %61, %subreg.sub3 - GLOBAL_STORE_DWORDX4_SADDR killed %1, killed %65, killed %2, 64, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4_SADDR killed %1, killed %65, killed %2, 64, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -12,7 +12,7 @@ ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 @@ -21,7 +21,7 @@ ; GCN: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GCN: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "BufferResource", align 1, addrspace 4) ; GCN: S_ENDPGM 0 main_body: %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) Index: llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir =================================================================== --- 
llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir +++ llvm/test/CodeGen/AMDGPU/fastregalloc-self-loop-heuristic.mir @@ -17,8 +17,8 @@ ; GCN: bb.1: ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) - ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec + ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec ; GCN: bb.2: ; GCN: S_ENDPGM 0 @@ -27,8 +27,8 @@ %0:vreg_64 = COPY $vgpr0_vgpr1 bb.1: - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_EXECZ %bb.1, implicit $exec bb.2: @@ -52,11 +52,11 @@ ; GCN: bb.1: ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) - ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, implicit $exec - ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec + ; GCN: renamable $vgpr2 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec ; GCN: 
SI_SPILL_V32_SAVE $vgpr2, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) - ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec ; GCN: bb.2: ; GCN: S_ENDPGM 0 @@ -65,10 +65,10 @@ %0:vreg_64 = COPY $vgpr0_vgpr1 bb.1: - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_EXECZ %bb.1, implicit $exec bb.2: @@ -96,7 +96,7 @@ ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) ; GCN: renamable $vgpr2 = V_ADD_U32_e32 1, undef $vgpr0, implicit $exec ; GCN: SI_SPILL_V32_SAVE $vgpr2, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) - ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec ; GCN: bb.2: ; GCN: S_ENDPGM 0 @@ -106,7 +106,7 @@ bb.1: %1:vgpr_32 = V_ADD_U32_e32 1, undef %1, implicit $exec - GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_EXECZ %bb.1, implicit $exec bb.2: @@ -130,7 +130,7 @@ ; GCN: bb.1: ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from 
%stack.0, align 4, addrspace 5) - ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr0, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec ; GCN: renamable $vgpr0 = V_ADD_U32_e64 1, 1, 0, implicit $exec ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec @@ -141,7 +141,7 @@ %0:vreg_64 = COPY $vgpr0_vgpr1 bb.1: - GLOBAL_STORE_DWORD %0, undef %1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, undef %1, 0, 0, 0, 0, 0, implicit $exec %1:vgpr_32 = V_ADD_U32_e64 1, 1, 0, implicit $exec S_CBRANCH_EXECZ %bb.1, implicit $exec @@ -166,8 +166,8 @@ ; GCN: bb.1: ; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN: $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) - ; GCN: undef renamable $vgpr3 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit-def dead $vgpr2_vgpr3 - ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: undef renamable $vgpr3 = GLOBAL_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit-def dead $vgpr2_vgpr3 + ; GCN: GLOBAL_STORE_DWORD renamable $vgpr0_vgpr1, undef renamable $vgpr1, 0, 0, 0, 0, 0, implicit $exec ; GCN: S_CBRANCH_EXECZ %bb.1, implicit $exec ; GCN: bb.2: ; GCN: S_ENDPGM 0 @@ -176,8 +176,8 @@ %0:vreg_64 = COPY $vgpr0_vgpr1 bb.1: - undef %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %0, undef %1.sub1, 0, 0, 0, 0, implicit $exec + undef %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, undef %1.sub1, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_EXECZ %bb.1, implicit $exec bb.2: Index: llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir 
=================================================================== --- llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir +++ llvm/test/CodeGen/AMDGPU/flat-load-clustering.mir @@ -61,17 +61,17 @@ undef %12.sub0 = V_ADD_CO_U32_e32 %4.sub0, %7, implicit-def $vcc, implicit $exec %11 = COPY %4.sub1 %12.sub1 = V_ADDC_U32_e32 %11, %2, implicit-def dead $vcc, implicit killed $vcc, implicit $exec - %5 = FLAT_LOAD_DWORD %12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.gep1) + %5 = FLAT_LOAD_DWORD %12, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.gep1) undef %9.sub0 = V_ADD_CO_U32_e32 %3.sub0, %7, implicit-def $vcc, implicit $exec %8 = COPY %3.sub1 %9.sub1 = V_ADDC_U32_e32 %8, %2, implicit-def dead $vcc, implicit killed $vcc, implicit $exec undef %13.sub0 = V_ADD_CO_U32_e32 16, %12.sub0, implicit-def $vcc, implicit $exec %13.sub1 = V_ADDC_U32_e32 %12.sub1, %2, implicit-def dead $vcc, implicit killed $vcc, implicit $exec - %6 = FLAT_LOAD_DWORD %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.gep34) + %6 = FLAT_LOAD_DWORD %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.gep34) undef %10.sub0 = V_ADD_CO_U32_e32 16, %9.sub0, implicit-def $vcc, implicit $exec %10.sub1 = V_ADDC_U32_e32 %9.sub1, %2, implicit-def dead $vcc, implicit killed $vcc, implicit $exec - FLAT_STORE_DWORD %9, %5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.gep2) - FLAT_STORE_DWORD %10, %6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.gep4) + FLAT_STORE_DWORD %9, %5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.gep2) + FLAT_STORE_DWORD %10, %6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.gep4) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir +++ llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir @@ -9,10 +9,10 @@ bb.0.entry: ; GCN-LABEL: name: test_fold_fi_scratch_load_vgpr ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; GCN: S_ENDPGM 0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) S_ENDPGM 0 ... @@ -25,10 +25,10 @@ bb.0.entry: ; GCN-LABEL: name: test_fold_fi_scratch_load_sgpr ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 %stack.0 - ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; GCN: S_ENDPGM 0 %0:sgpr_32 = S_MOV_B32 %stack.0 - %1:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %0:sgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + %1:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %0:sgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) S_ENDPGM 0 ... 
@@ -42,11 +42,11 @@ ; GCN-LABEL: name: test_fold_fi_scratch_store_vgpr ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) ; GCN: S_ENDPGM 0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = IMPLICIT_DEF - SCRATCH_STORE_DWORD %1:vgpr_32, %0:vgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + SCRATCH_STORE_DWORD %1:vgpr_32, %0:vgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) S_ENDPGM 0 ... @@ -60,11 +60,11 @@ ; GCN-LABEL: name: test_no_fold_fi_scratch_store_vgpr ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN: SCRATCH_STORE_DWORD [[V_MOV_B32_e32_]], [[DEF]], 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; GCN: SCRATCH_STORE_DWORD [[V_MOV_B32_e32_]], [[DEF]], 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) ; GCN: S_ENDPGM 0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = IMPLICIT_DEF - SCRATCH_STORE_DWORD %0:vgpr_32, %1:vgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + SCRATCH_STORE_DWORD %0:vgpr_32, %1:vgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) S_ENDPGM 0 ... 
@@ -78,11 +78,11 @@ ; GCN-LABEL: name: test_fold_fi_scratch_store_sgpr ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 %stack.0 ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) ; GCN: S_ENDPGM 0 %0:sgpr_32 = S_MOV_B32 %stack.0 %1:vgpr_32 = IMPLICIT_DEF - SCRATCH_STORE_DWORD_SADDR %1:vgpr_32, %0:sgpr_32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + SCRATCH_STORE_DWORD_SADDR %1:vgpr_32, %0:sgpr_32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/fma.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fma.f64.ll +++ llvm/test/CodeGen/AMDGPU/fma.f64.ll @@ -1,5 +1,6 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A -check-prefix=FUNC %s declare double @llvm.fma.f64(double, double, double) nounwind readnone declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone @@ -8,6 +9,7 @@ ; FUNC-LABEL: {{^}}fma_f64: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, 
double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -21,6 +23,8 @@ ; FUNC-LABEL: {{^}}fma_v2f64: ; SI: v_fma_f64 ; SI: v_fma_f64 +; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 @@ -36,6 +40,10 @@ ; SI: v_fma_f64 ; SI: v_fma_f64 ; SI: v_fma_f64 +; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 @@ -48,6 +56,7 @@ ; FUNC-LABEL: {{^}}fma_f64_abs_src0: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -61,6 +70,7 @@ ; FUNC-LABEL: {{^}}fma_f64_abs_src1: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}} define amdgpu_kernel void
@fma_f64_abs_src1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -74,6 +84,7 @@ ; FUNC-LABEL: {{^}}fma_f64_abs_src2: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}} +; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}} define amdgpu_kernel void @fma_f64_abs_src2(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -87,6 +98,7 @@ ; FUNC-LABEL: {{^}}fma_f64_neg_src0: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -100,6 +112,7 @@ ; FUNC-LABEL: {{^}}fma_f64_neg_src1: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -113,6 +126,7 @@ ; FUNC-LABEL: {{^}}fma_f64_neg_src2: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_neg_src2(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -126,6 
+140,7 @@ ; FUNC-LABEL: {{^}}fma_f64_abs_neg_src0: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -140,6 +155,7 @@ ; FUNC-LABEL: {{^}}fma_f64_abs_neg_src1: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} define amdgpu_kernel void @fma_f64_abs_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -154,6 +170,7 @@ ; FUNC-LABEL: {{^}}fma_f64_abs_neg_src2: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} +; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} define amdgpu_kernel void @fma_f64_abs_neg_src2(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -168,6 +185,7 @@ ; FUNC-LABEL: {{^}}fma_f64_lit_src0: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fma_f64_lit_src0(double addrspace(1)* %out, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r1 = load double, double addrspace(1)* %in2 @@ -179,6 +197,7 @@ ; FUNC-LABEL: {{^}}fma_f64_lit_src1: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} +; GFX90A: v_fmac_f64_e32 {{v\[[0-9]+:[0-9]+\], 2.0, v\[[0-9]+:[0-9]+\]}} define 
amdgpu_kernel void @fma_f64_lit_src1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 @@ -190,6 +209,7 @@ ; FUNC-LABEL: {{^}}fma_f64_lit_src2: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0}} +; GFX90A: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], 2.0}} define amdgpu_kernel void @fma_f64_lit_src2(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 Index: llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir +++ llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir @@ -22,13 +22,13 @@ ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; GCN: SI_RETURN_TO_EPILOG $vgpr0 %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 %1:sreg_32_xm0 = S_MOV_B32 0 %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %3 SI_RETURN_TO_EPILOG $vgpr0 @@ -54,12 +54,12 @@ ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; GCN: SI_RETURN_TO_EPILOG $vgpr0 %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %3 SI_RETURN_TO_EPILOG $vgpr0 @@ -83,16 +83,16 @@ ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_soffset ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GCN: S_ENDPGM 0, implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec %2:sreg_32_xm0 = S_MOV_B32 0 - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec - %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 
0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %3 S_ENDPGM 0, implicit $vgpr0 @@ -115,15 +115,15 @@ ; GCN-LABEL: name: kernel_fold_fi_mubuf ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GCN: S_ENDPGM 0, implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 @@ -152,13 +152,13 @@ ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; GCN: SI_RETURN_TO_EPILOG $vgpr0 %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 %1:sreg_32_xm0 = S_MOV_B32 0 %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %3 SI_RETURN_TO_EPILOG $vgpr0 @@ -185,12 +185,12 @@ ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; GCN: SI_RETURN_TO_EPILOG $vgpr0 %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %3 SI_RETURN_TO_EPILOG $vgpr0 @@ -214,15 +214,15 @@ ; GCN-LABEL: name: function_no_fold_fi_non_stack_soffset ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: 
[[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GCN: S_ENDPGM 0, implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 @@ -246,15 +246,15 @@ ; GCN-LABEL: name: function_fold_fi_mubuf_wave_relative ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GCN: S_ENDPGM 0, implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, 
%0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 @@ -278,15 +278,15 @@ ; GCN-LABEL: name: function_fold_fi_mubuf_stack_relative ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GCN: S_ENDPGM 0, implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir +++ llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir @@ -17,7 +17,7 @@ %4:vgpr_32 = V_LSHLREV_B32_e64 killed %3, %0, implicit $exec %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %6:vreg_64 = REG_SEQUENCE 
killed %4, %subreg.sub0, killed %5, %subreg.sub1 - %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec %8:sreg_32_xm0 = S_MOV_B32 65535 %9:vgpr_32 = COPY %8 %10:vgpr_32 = V_AND_B32_e32 %7, %9, implicit $exec Index: llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir +++ llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir @@ -158,10 +158,10 @@ %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) %12 = V_MOV_B32_e32 1065353216, implicit $exec %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -222,13 +222,13 @@ %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %13 = V_MOV_B32_e32 1065353216, implicit $exec %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $mode, implicit $exec %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -289,14 +289,14 @@ %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %14 = V_MOV_B32_e32 1065353216, implicit $exec %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $mode, implicit $exec %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -360,16 +360,16 @@ %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %14 = V_MOV_B32_e32 1065353216, implicit $exec %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $mode, implicit $exec %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $mode, implicit $exec %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float 
addrspace(1)* undef`) S_ENDPGM 0 ... @@ -427,13 +427,13 @@ %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %13 = V_MOV_B32_e32 1, implicit $exec %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $mode, implicit $exec %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -494,16 +494,16 @@ %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %14 = V_MOV_B32_e32 -2, implicit $exec %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $mode, implicit $exec %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $mode, implicit $exec %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* 
undef`) S_ENDPGM 0 ... @@ -564,13 +564,13 @@ %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %13 = V_MOV_B32_e32 15360, implicit $exec %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $mode, implicit $exec %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -631,13 +631,13 @@ %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) %13 = V_MOV_B32_e32 80886784, implicit $exec %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit $mode, implicit $exec %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -697,13 +697,13 @@ %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) %13 = V_MOV_B32_e32 305413120, implicit $exec %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $mode, implicit $exec %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir +++ llvm/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir @@ -60,13 +60,13 @@ %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = nofpexcept V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $mode, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -131,13 +131,13 @@ %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = nofpexcept V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit $mode, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -202,13 +202,13 @@ %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = nofpexcept V_MAD_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $mode, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -273,13 +273,13 @@ %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = nofpexcept V_MAD_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit $mode, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/fold-multiple.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/fold-multiple.mir +++ llvm/test/CodeGen/AMDGPU/fold-multiple.mir @@ -34,7 +34,7 @@ %3 = S_LSHL_B32 %1, killed %1, implicit-def dead $scc %4 = V_AND_B32_e64 killed %2, killed %3, implicit $exec %5 = IMPLICIT_DEF - BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir +++ llvm/test/CodeGen/AMDGPU/fp-atomic-to-s_denormmode.mir @@ -8,7 +8,7 @@ name: flat_atomic_fcmpswap_to_s_denorm_mode body: | bb.0: - FLAT_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + FLAT_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -20,7 +20,7 @@ name: flat_atomic_fcmpswap_x2_to_s_denorm_mode body: | bb.0: - FLAT_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_128, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + FLAT_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_128, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
@@ -32,7 +32,7 @@ name: flat_atomic_fmax_to_s_denorm_mode body: | bb.0: - FLAT_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + FLAT_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -44,7 +44,7 @@ name: flat_atomic_fmax_x2_to_s_denorm_mode body: | bb.0: - FLAT_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + FLAT_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -56,7 +56,7 @@ name: flat_atomic_fmin_to_s_denorm_mode body: | bb.0: - FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -68,7 +68,7 @@ name: flat_atomic_fmin_x2_to_s_denorm_mode body: | bb.0: - FLAT_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + FLAT_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
@@ -80,7 +80,7 @@ name: flat_atomic_fcmpswap_x2_rtn_to_s_denorm_mode body: | bb.0: - %2:vreg_64 = FLAT_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_128, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vreg_64 = FLAT_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_128, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -92,7 +92,7 @@ name: flat_atomic_fmax_rtn_to_s_denorm_mode body: | bb.0: - %2:vgpr_32 = FLAT_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vgpr_32 = FLAT_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -104,7 +104,7 @@ name: flat_atomic_fmax_x2_rtn_to_s_denorm_mode body: | bb.0: - %2:vreg_64 = FLAT_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vreg_64 = FLAT_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
@@ -116,7 +116,7 @@ name: flat_atomic_fmin_rtn_to_s_denorm_mode body: | bb.0: - %2:vgpr_32 = FLAT_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vgpr_32 = FLAT_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -128,7 +128,7 @@ name: flat_atomic_fmin_x2_rtn_to_s_denorm_mode body: | bb.0: - %2:vreg_64 = FLAT_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vreg_64 = FLAT_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -140,7 +140,7 @@ name: flat_atomic_fcmpswap_rtn_to_s_denorm_mode body: | bb.0: - %2:vgpr_32 = FLAT_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vgpr_32 = FLAT_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
@@ -152,7 +152,7 @@ name: global_atomic_fcmpswap_to_s_denorm_mode body: | bb.0: - GLOBAL_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + GLOBAL_ATOMIC_FCMPSWAP undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -164,7 +164,7 @@ name: global_atomic_fcmpswap_x2_to_s_denorm_mode body: | bb.0: - GLOBAL_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + GLOBAL_ATOMIC_FCMPSWAP_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -176,7 +176,7 @@ name: global_atomic_fmax_to_s_denorm_mode body: | bb.0: - GLOBAL_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + GLOBAL_ATOMIC_FMAX undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -188,7 +188,7 @@ name: global_atomic_fmax_x2_to_s_denorm_mode body: | bb.0: - GLOBAL_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + GLOBAL_ATOMIC_FMAX_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
@@ -200,7 +200,7 @@ name: global_atomic_fmin_to_s_denorm_mode body: | bb.0: - GLOBAL_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + GLOBAL_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -212,7 +212,7 @@ name: global_atomic_fmin_x2_to_s_denorm_mode body: | bb.0: - GLOBAL_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + GLOBAL_ATOMIC_FMIN_X2 undef %0:vreg_64, undef %1:vreg_64, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -224,7 +224,7 @@ name: global_atomic_fcmpswap_rtn_to_s_denorm_mode body: | bb.0: - %2:vgpr_32 = GLOBAL_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vgpr_32 = GLOBAL_ATOMIC_FCMPSWAP_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -236,7 +236,7 @@ name: global_atomic_fcmpswap_x2_rtn_to_s_denorm_mode body: | bb.0: - %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
@@ -248,7 +248,7 @@ name: global_atomic_fmax_rtn_to_s_denorm_mode body: | bb.0: - %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -260,7 +260,7 @@ name: global_atomic_fmax_x2_rtn_to_s_denorm_mode body: | bb.0: - %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -272,7 +272,7 @@ name: global_atomic_fmin_rtn_to_s_denorm_mode body: | bb.0: - %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_RTN undef %0:vreg_64, undef %1:vgpr_32, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -284,7 +284,7 @@ name: global_atomic_fmin_x2_rtn_to_s_denorm_mode body: | bb.0: - %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_RTN undef %0:vreg_64, undef %1:vreg_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
@@ -296,7 +296,7 @@ name: global_atomic_fcmpswap_saddr_to_s_denorm_mode body: | bb.0: - GLOBAL_ATOMIC_FCMPSWAP_SADDR undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + GLOBAL_ATOMIC_FCMPSWAP_SADDR undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -308,7 +308,7 @@ name: global_atomic_fcmpswap_x2_saddr_rtn_to_s_denorm_mode body: | bb.0: - %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vreg_64 = GLOBAL_ATOMIC_FCMPSWAP_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -320,7 +320,7 @@ name: global_atomic_fmax_saddr_rtn_to_s_denorm_mode body: | bb.0: - %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_SADDR_RTN undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vgpr_32 = GLOBAL_ATOMIC_FMAX_SADDR_RTN undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
@@ -332,7 +332,7 @@ name: global_atomic_fmax_x2_saddr_rtn_to_s_denorm_mode body: | bb.0: - %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vreg_64 = GLOBAL_ATOMIC_FMAX_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -344,7 +344,7 @@ name: global_atomic_fmin_saddr_rtn_to_s_denorm_mode body: | bb.0: - %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_SADDR_RTN undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vgpr_32 = GLOBAL_ATOMIC_FMIN_SADDR_RTN undef %0:vgpr_32, undef %1:vgpr_32, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -356,7 +356,7 @@ name: global_atomic_fmin_x2_saddr_rtn_to_s_denorm_mode body: | bb.0: - %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + %2:vreg_64 = GLOBAL_ATOMIC_FMIN_X2_SADDR_RTN undef %0:vgpr_32, undef %1:vreg_64, undef %3:sgpr_64, 0, -1, 0, 0, implicit $exec :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
@@ -368,7 +368,7 @@ name: flat_fp_atomic_to_s_denorm_mode_waitcnt body: | bb.0: - FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) S_WAITCNT 0 S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... @@ -381,7 +381,7 @@ name: flat_fp_atomic_to_s_denorm_mode_valu body: | bb.0: - FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) + FLAT_ATOMIC_FMIN undef %0:vreg_64, undef %1:vgpr_32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst seq_cst 4 on `float addrspace(1)* undef`) %2:vgpr_32 = V_ADD_F32_e32 undef %1:vgpr_32, undef %1:vgpr_32, implicit $mode, implicit $exec S_DENORM_MODE 0, implicit-def $mode, implicit $mode ... 
Index: llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -0,0 +1,737 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A + +declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1) +declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg) +declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) +declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg) +declare double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) +declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) +declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) +declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data) +declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data) +declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data) +declare double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* nocapture, double, i32, i32, i1) + +define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: buffer_atomic_add_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; 
GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + ret void +} + +define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: buffer_atomic_add_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + store double %ret, double addrspace(1)* %out, align 8 + ret 
void +} + +define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v3, s[4:7], 4 offen glc slc +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; 
GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, 
double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v3, s[4:7], 4 offen glc slc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], 
v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + ret void +} + +define amdgpu_ps 
void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v3, s[4:7], 4 offen glc slc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NEXT: v_mov_b32_e32 v2, s8 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) { +; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + store double %ret, double* undef + ret void +} + +define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(1)* %out) { +; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_mov_b32_e32 v3, s10 +; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v3, s[4:7], 0 idxen offset:4 glc slc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) + store double %ret, double addrspace(1)* %out, align 8 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: 
global_atomic_fadd_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fmin_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fmax_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(double addrspace(1)* %ptr) #1 { +; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspace(1)* %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: BB25_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_mov_b32_e32 v4, v0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v6, v[2:5], s[0:1] glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB25_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst + ret void +} + +define double @global_atomic_fadd_f64_rtn(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fadd_f64_rtn: +; GFX90A: ; %bb.0: ; 
%main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret double %ret +} + +define double @global_atomic_fadd_f64_rtn_pat(double addrspace(1)* %ptr, double %data) #1 { +; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst + ret double %ret +} + +define double @global_atomic_fmax_f64_rtn(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fmax_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret double %ret +} + +define double @global_atomic_fmin_f64_rtn(double addrspace(1)* %ptr, double %data) { +; GFX90A-LABEL: global_atomic_fmin_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double 
@llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data) + ret double %ret +} + +define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst + ret void +} + +define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 { +; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst + ret double %ret +} + +define amdgpu_kernel void @flat_atomic_fadd_f64_noret(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fadd_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; 
GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data) + ret void +} + +define double @flat_atomic_fadd_f64_rtn(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fadd_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %ptr, double %data) + ret double %ret +} + +define amdgpu_kernel void @flat_atomic_fmin_f64_noret(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fmin_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data) + ret void +} + +define double @flat_atomic_fmin_f64_rtn(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fmin_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %ptr, double %data) + ret double %ret +} + +define amdgpu_kernel void @flat_atomic_fmax_f64_noret(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fmax_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data) + ret void +} + +define double @flat_atomic_fmax_f64_rtn(double* %ptr, double %data) { +; GFX90A-LABEL: flat_atomic_fmax_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %ptr, double %data) + ret double %ret +} + +define amdgpu_kernel void @local_atomic_fadd_f64_noret(double addrspace(3)* %ptr, double %data) { +; GFX90A-LABEL: local_atomic_fadd_f64_noret: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: ds_add_f64 v2, v[0:1] +; GFX90A-NEXT: s_endpgm +main_body: + %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0) + ret void +} + +define double @local_atomic_fadd_f64_rtn(double addrspace(3)* %ptr, double %data) { +; GFX90A-LABEL: local_atomic_fadd_f64_rtn: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, v1 +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0) + ret double %ret +} + +define 
amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(double addrspace(3)* %ptr) #1 { +; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ds_add_f64 v2, v[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(double addrspace(3)* %ptr) #0 { +; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: ds_read_b64 v[0:1], v0 +; GFX90A-NEXT: BB41_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v4, s0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB41_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_endpgm +main_body: + %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst + ret void +} + +define double @local_atomic_fadd_f64_rtn_pat(double addrspace(3)* %ptr, double %data) #1 { +; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat: +; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst + ret double %ret +} + +attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } +attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" } Index: llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll @@ -0,0 +1,22 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s + +; GFX9-DAG: buffer_load_format_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding: +; GFX9-DAG: buffer_load_format_d16_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding: +; GFX908-DAG: v_mfma_i32_4x4x4i8 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] ; encoding: [{{0x..,0x0.,}} +; GFX90A-DAG: v_mfma_i32_4x4x4i8 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] ; encoding: [{{0x..,0x8.,}} +define amdgpu_kernel void @test(<4 x i32> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %r1 = tail call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %x, i32 %id, i32 0, i1 zeroext false, i1 zeroext false) + store volatile <4 x float> %r1, <4 x float>* undef + %r2 = tail call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %x, i32 %id, i32 0, i1 zeroext false, i1 zeroext false) + store volatile <4 x half> %r2, <4 x half>* undef + %r3 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %x, i32 0, i32 0, i32 0) + store <4 x i32> %r3, <4 x i32>* undef + 
ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1 immarg, i1 immarg) +declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1 immarg, i1 immarg) +declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -1,65 +1,346 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s -; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32: -; GCN: [[LOOP:BB[0-9]+_[0-9]+]] -; GCN: v_add_f32_e32 -; GCN: global_atomic_cmpswap -; GCN: s_andn2_b64 exec, exec, -; GCN-NEXT: s_cbranch_execnz [[LOOP]] define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { +; GFX900-LABEL: global_atomic_fadd_ret_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: BB0_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX900-NEXT: s_cbranch_execnz BB0_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX900-NEXT: global_store_dword v[0:1], v0, off +; GFX900-NEXT: s_endpgm +; +; GFX908-LABEL: global_atomic_fadd_ret_f32: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: BB0_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0 +; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX908-NEXT: s_cbranch_execnz BB0_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: global_store_dword v[0:1], v0, off +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: global_atomic_fadd_ret_f32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* 
%ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void } -; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32_ieee: -; GCN: [[LOOP:BB[0-9]+_[0-9]+]] -; GCN: v_add_f32_e32 -; GCN: global_atomic_cmpswap -; GCN: s_andn2_b64 exec, exec, -; GCN-NEXT: s_cbranch_execnz [[LOOP]] -define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) { +define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) #2 { +; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: BB1_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX900-NEXT: s_cbranch_execnz BB1_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX900-NEXT: global_store_dword v[0:1], v0, off +; GFX900-NEXT: s_endpgm +; +; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: BB1_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX908-NEXT: s_cbranch_execnz BB1_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX908-NEXT: global_store_dword v[0:1], v0, off +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: BB1_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB1_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GFX90A-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void } -; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32: -; GFX900: [[LOOP:BB[0-9]+_[0-9]+]] -; GFX900: v_add_f32_e32 -; GFX900: 
global_atomic_cmpswap -; GFX900: s_andn2_b64 exec, exec, -; GFX900-NEXT: s_cbranch_execnz [[LOOP]] - -; GFX908-NOT: v_add_f32 -; GFX908: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s -; GFX908-NOT: s_cbranch_execnz define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 { +; GFX900-LABEL: global_atomic_fadd_noret_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s4 +; GFX900-NEXT: BB2_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX900-NEXT: s_cbranch_execnz BB2_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_endpgm +; +; GFX908-LABEL: global_atomic_fadd_noret_f32: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: global_atomic_fadd_noret_f32: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] scc +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst ret void } -; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32_ieee: -; GCN: global_atomic_cmpswap -define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) { +define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) #2 { +; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s4 +; GFX900-NEXT: BB3_1: ; %atomicrmw.start +; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_wbinvl1_vol +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX900-NEXT: s_cbranch_execnz BB3_1 +; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX900-NEXT: s_endpgm +; +; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v1, s4 +; GFX908-NEXT: BB3_1: ; %atomicrmw.start +; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX908-NEXT: v_mov_b32_e32 v2, 0 +; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX908-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: buffer_wbinvl1_vol +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX908-NEXT: s_cbranch_execnz BB3_1 +; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: BB3_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_atomic_cmpswap v1, v1, v[2:3], s[0:1] glc scc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v0 +; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_cbranch_execnz BB3_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst ret void } -; Make sure this artificially selects with an incorrect subtarget, but the feature set. 
-; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32_wrong_subtarget: define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 { +; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_mov_b64 s[2:3], 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: BB4_1: ; %atomicrmw.start +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_mov_b32_e32 v1, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1_vol +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GCN-NEXT: s_cbranch_execnz BB4_1 +; GCN-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: global_store_dword v[0:1], v0, off +; GCN-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void } -; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32_wrong_subtarget: define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 { +; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 4.0 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1_vol +; GCN-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst ret void } attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } 
-attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" } +attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" "amdgpu-unsafe-fp-atomics"="true" } +attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" } Index: llvm/test/CodeGen/AMDGPU/hard-clauses.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/hard-clauses.mir +++ llvm/test/CodeGen/AMDGPU/hard-clauses.mir @@ -44,168 +44,168 @@ ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0 ; CHECK: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit-def $vgpr6, implicit-def $vgpr6_lo16, implicit-def $vgpr6_hi16, implicit-def $vgpr7, implicit-def $vgpr7_lo16, implicit-def $vgpr7_hi16, implicit-def $vgpr8, implicit-def $vgpr8_lo16, implicit-def $vgpr8_hi16, implicit-def $vgpr9, implicit-def $vgpr9_lo16, implicit-def $vgpr9_hi16, implicit-def $vgpr10, implicit-def $vgpr10_lo16, implicit-def $vgpr10_hi16, implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit-def $vgpr13, implicit-def $vgpr13_lo16, implicit-def $vgpr13_hi16, implicit-def $vgpr14, implicit-def $vgpr14_lo16, implicit-def $vgpr14_hi16, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit-def $vgpr17, implicit-def $vgpr17_lo16, implicit-def $vgpr17_hi16, implicit-def $vgpr18, implicit-def $vgpr18_lo16, implicit-def $vgpr18_hi16, 
implicit-def $vgpr19, implicit-def $vgpr19_lo16, implicit-def $vgpr19_hi16, implicit-def $vgpr20, implicit-def $vgpr20_lo16, implicit-def $vgpr20_hi16, implicit-def $vgpr21, implicit-def $vgpr21_lo16, implicit-def $vgpr21_hi16, implicit-def $vgpr22, implicit-def $vgpr22_lo16, implicit-def $vgpr22_hi16, implicit-def $vgpr23, implicit-def $vgpr23_lo16, implicit-def $vgpr23_hi16, implicit-def $vgpr24, implicit-def $vgpr24_lo16, implicit-def $vgpr24_hi16, implicit-def $vgpr25, implicit-def $vgpr25_lo16, implicit-def $vgpr25_hi16, implicit-def $vgpr26, implicit-def $vgpr26_lo16, implicit-def $vgpr26_hi16, implicit-def $vgpr27, implicit-def $vgpr27_lo16, implicit-def $vgpr27_hi16, implicit-def $vgpr28, implicit-def $vgpr28_lo16, implicit-def $vgpr28_hi16, implicit-def $vgpr29, implicit-def $vgpr29_lo16, implicit-def $vgpr29_hi16, implicit-def $vgpr30, implicit-def $vgpr30_lo16, implicit-def $vgpr30_hi16, implicit-def $vgpr31, implicit-def $vgpr31_lo16, implicit-def $vgpr31_hi16, implicit-def $vgpr32, implicit-def $vgpr32_lo16, implicit-def $vgpr32_hi16, implicit-def $vgpr33, implicit-def $vgpr33_lo16, implicit-def $vgpr33_hi16, implicit-def $vgpr34, implicit-def $vgpr34_lo16, implicit-def $vgpr34_hi16, implicit-def $vgpr35, implicit-def $vgpr35_lo16, implicit-def $vgpr35_hi16, implicit-def $vgpr36, implicit-def $vgpr36_lo16, implicit-def $vgpr36_hi16, implicit-def $vgpr37, implicit-def $vgpr37_lo16, implicit-def $vgpr37_hi16, implicit-def $vgpr38, implicit-def $vgpr38_lo16, implicit-def $vgpr38_hi16, implicit-def $vgpr39, implicit-def $vgpr39_lo16, implicit-def $vgpr39_hi16, implicit-def $vgpr40, implicit-def $vgpr40_lo16, implicit-def $vgpr40_hi16, implicit-def $vgpr41, implicit-def $vgpr41_lo16, implicit-def $vgpr41_hi16, implicit-def $vgpr42, implicit-def $vgpr42_lo16, implicit-def $vgpr42_hi16, implicit-def $vgpr43, implicit-def $vgpr43_lo16, implicit-def $vgpr43_hi16, implicit-def $vgpr44, implicit-def $vgpr44_lo16, implicit-def $vgpr44_hi16, implicit-def $vgpr45, 
implicit-def $vgpr45_lo16, implicit-def $vgpr45_hi16, implicit-def $vgpr46, implicit-def $vgpr46_lo16, implicit-def $vgpr46_hi16, implicit-def $vgpr47, implicit-def $vgpr47_lo16, implicit-def $vgpr47_hi16, implicit-def $vgpr48, implicit-def $vgpr48_lo16, implicit-def $vgpr48_hi16, implicit-def $vgpr49, implicit-def $vgpr49_lo16, implicit-def $vgpr49_hi16, implicit-def $vgpr50, implicit-def $vgpr50_lo16, implicit-def $vgpr50_hi16, implicit-def $vgpr51, implicit-def $vgpr51_lo16, implicit-def $vgpr51_hi16, implicit-def $vgpr52, implicit-def $vgpr52_lo16, implicit-def $vgpr52_hi16, implicit-def $vgpr53, implicit-def $vgpr53_lo16, implicit-def $vgpr53_hi16, implicit-def $vgpr54, implicit-def $vgpr54_lo16, implicit-def $vgpr54_hi16, implicit-def $vgpr55, implicit-def $vgpr55_lo16, implicit-def $vgpr55_hi16, implicit-def $vgpr56, implicit-def $vgpr56_lo16, implicit-def $vgpr56_hi16, implicit-def $vgpr57, implicit-def $vgpr57_lo16, implicit-def $vgpr57_hi16, implicit-def $vgpr58, implicit-def $vgpr58_lo16, implicit-def $vgpr58_hi16, implicit-def $vgpr59, implicit-def $vgpr59_lo16, implicit-def $vgpr59_hi16, implicit-def $vgpr60, implicit-def $vgpr60_lo16, implicit-def $vgpr60_hi16, implicit-def $vgpr61, implicit-def $vgpr61_lo16, implicit-def $vgpr61_hi16, implicit-def $vgpr62, implicit-def $vgpr62_lo16, implicit-def $vgpr62_hi16, implicit-def $vgpr63, implicit-def $vgpr63_lo16, implicit-def $vgpr63_hi16, implicit-def $vgpr64, implicit-def $vgpr64_lo16, implicit-def $vgpr64_hi16, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec { ; CHECK: S_CLAUSE 63 - ; CHECK: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, 
$sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 
0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 
0, implicit $exec - ; CHECK: $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, 
implicit $exec - ; CHECK: $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, 0, 
implicit $exec + ; CHECK: $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 
100, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr42 = 
BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 
0, 0, implicit $exec + ; CHECK: $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, 0, implicit $exec ; CHECK: } ; CHECK: BUNDLE implicit-def $vgpr65, implicit-def $vgpr65_lo16, implicit-def $vgpr65_hi16, implicit-def $vgpr66, implicit-def $vgpr66_lo16, implicit-def $vgpr66_hi16, implicit-def $vgpr67, implicit-def $vgpr67_lo16, implicit-def $vgpr67_hi16, implicit-def $vgpr68, implicit-def $vgpr68_lo16, implicit-def $vgpr68_hi16, implicit-def $vgpr69, implicit-def $vgpr69_lo16, implicit-def $vgpr69_hi16, implicit-def $vgpr70, implicit-def $vgpr70_lo16, implicit-def $vgpr70_hi16, implicit-def $vgpr71, implicit-def $vgpr71_lo16, implicit-def $vgpr71_hi16, implicit-def $vgpr72, implicit-def $vgpr72_lo16, implicit-def $vgpr72_hi16, implicit-def $vgpr73, implicit-def $vgpr73_lo16, implicit-def $vgpr73_hi16, implicit-def $vgpr74, implicit-def $vgpr74_lo16, implicit-def $vgpr74_hi16, implicit-def $vgpr75, implicit-def $vgpr75_lo16, implicit-def $vgpr75_hi16, implicit-def $vgpr76, implicit-def $vgpr76_lo16, implicit-def $vgpr76_hi16, implicit-def $vgpr77, implicit-def $vgpr77_lo16, implicit-def $vgpr77_hi16, implicit-def $vgpr78, implicit-def $vgpr78_lo16, implicit-def $vgpr78_hi16, implicit-def $vgpr79, implicit-def $vgpr79_lo16, implicit-def $vgpr79_hi16, implicit-def $vgpr80, implicit-def $vgpr80_lo16, implicit-def $vgpr80_hi16, implicit 
$vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec { ; CHECK: S_CLAUSE 15 - ; CHECK: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, implicit $exec - ; CHECK: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr65 = BUFFER_LOAD_DWORD_OFFEN 
$vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, 0, implicit $exec ; CHECK: } - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, implicit $exec - $vgpr2 
= BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, implicit $exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, implicit $exec - $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec - $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, implicit $exec - $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, implicit $exec - $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, implicit $exec - $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, implicit $exec - $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, implicit $exec - $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, implicit $exec - $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, implicit $exec - $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, implicit $exec - $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, implicit $exec - $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 0, 0, 0, 0, 0, implicit $exec - $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, implicit $exec - $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, implicit $exec - $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, implicit $exec - $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, implicit $exec - $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, implicit $exec - $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, 
implicit $exec - $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, 0, 0, implicit $exec - $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, implicit $exec - $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, implicit $exec - $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, implicit $exec - $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, 0, 0, implicit $exec - $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, implicit $exec - $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, implicit $exec - $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, implicit $exec - $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, implicit $exec - $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, implicit $exec - $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, implicit $exec - $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 0, implicit $exec - $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, implicit $exec - $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, implicit $exec - $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, implicit $exec - $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, implicit $exec - $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, implicit $exec - $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 0, implicit $exec - $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, 
$sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, implicit $exec - $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, implicit $exec - $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, implicit $exec - $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, implicit $exec - $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, implicit $exec - $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, implicit $exec - $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, implicit $exec - $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, implicit $exec - $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, implicit $exec - $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, implicit $exec - $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, implicit $exec - $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 0, implicit $exec - $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, implicit $exec - $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, implicit $exec - $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, implicit $exec - $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, implicit $exec - $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, implicit $exec - $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, implicit $exec - $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, 
implicit $exec - $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 0, implicit $exec - $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, implicit $exec - $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, implicit $exec - $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, implicit $exec - $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, implicit $exec - $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, implicit $exec - $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, implicit $exec - $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, implicit $exec - $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, implicit $exec - $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, implicit $exec - $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 0, implicit $exec - $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, implicit $exec - $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, implicit $exec - $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, implicit $exec - $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, implicit $exec - $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, implicit $exec - $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, implicit $exec - $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, implicit $exec - $vgpr76 = BUFFER_LOAD_DWORD_OFFEN 
$vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, implicit $exec - $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, implicit $exec - $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, implicit $exec - $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, implicit $exec - $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr4 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr5 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 20, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr6 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 24, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr7 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 28, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr8 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr9 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 36, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr10 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 40, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr11 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 44, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr12 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr13 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 52, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr14 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 56, 
0, 0, 0, 0, 0, 0, implicit $exec + $vgpr15 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 60, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr16 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr17 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 68, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr18 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 72, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr19 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 76, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr20 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr21 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 84, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr22 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 88, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr23 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 92, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr24 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr25 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 100, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr26 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 104, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr27 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 108, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr28 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr29 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 116, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr30 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 120, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr31 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 124, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr32 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 128, 0, 0, 0, 0, 
0, 0, implicit $exec + $vgpr33 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 132, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr34 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 136, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr35 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 140, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr36 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 144, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr37 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 148, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr38 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 152, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr39 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 156, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr40 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 160, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr41 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 164, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr42 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 168, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr43 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 172, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr44 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 176, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr45 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 180, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr46 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 184, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr47 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 188, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr48 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 192, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr49 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 196, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr50 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 200, 0, 0, 0, 0, 
0, 0, implicit $exec + $vgpr51 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 204, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr52 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 208, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr53 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 212, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr54 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 216, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr55 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 220, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr56 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 224, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr57 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 228, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr58 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 232, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr59 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 236, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr60 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 240, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr61 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 244, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr62 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 248, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr63 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 252, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr64 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr65 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 260, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr66 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 264, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr67 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 268, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr68 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 272, 0, 0, 0, 0, 
0, 0, implicit $exec + $vgpr69 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 276, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr70 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 280, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr71 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 284, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr72 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 288, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr73 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 292, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr74 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 296, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr75 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 300, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr76 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 304, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr77 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 308, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr78 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 312, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr79 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 316, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr80 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 320, 0, 0, 0, 0, 0, 0, implicit $exec ... 
Index: llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir +++ llvm/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir @@ -12,7 +12,7 @@ bb.0.entry: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr7, $vgpr8, $vgpr9, $vgpr10 - BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr7 = V_INTERP_P1_F32 $vgpr0, 0, 0, implicit $mode, implicit $m0, implicit $exec S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir +++ llvm/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir @@ -54,7 +54,7 @@ BUNDLE implicit-def $sgpr0_sgpr1, implicit $sgpr10_sgpr11 { $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr10_sgpr11, 0, 0, 0 } - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -84,5 +84,5 @@ } bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... 
Index: llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir +++ llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir @@ -78,7 +78,7 @@ $vgpr0 = IMPLICIT_DEF $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr10_sgpr11, 0, 0, 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec } S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/hazard-inlineasm.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/hazard-inlineasm.mir +++ llvm/test/CodeGen/AMDGPU/hazard-inlineasm.mir @@ -16,7 +16,7 @@ body: | bb.0: - FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr INLINEASM &"v_mad_u64_u32 $0, $1, $2, $3, $4", 0, 2621450, def $vgpr26_vgpr27, 2818058, def dead $sgpr14_sgpr15, 589833, $sgpr12, 327689, killed $vgpr51, 2621449, $vgpr46_vgpr47 S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/hazard-pass-ordering.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/hazard-pass-ordering.mir +++ llvm/test/CodeGen/AMDGPU/hazard-pass-ordering.mir @@ -15,7 +15,7 @@ $vgpr0 = V_MOV_B32_e32 1, implicit $exec $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec $sgpr8_sgpr9 = S_MOV_B64 -1 - $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr8_sgpr9, implicit-def dead $scc S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc Index: llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir +++ llvm/test/CodeGen/AMDGPU/hazard-recognizer-meta-insts.mir @@ -12,11 +12,11 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-LABEL: name: global_store_dwordx4_data_hazard_kill - ; GFX9: GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec ; GFX9: $vgpr2 = KILL ; GFX9: S_NOP 0 ; GFX9: $vgpr2 = V_MOV_B32_e32 0, implicit $exec - GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = KILL $vgpr2 = V_MOV_B32_e32 0, implicit $exec @@ -30,11 +30,11 @@ bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4 ; GFX9-LABEL: name: global_store_dwordx3_data_hazard_kill - ; GFX9: GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GFX9: GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GFX9: $vgpr2 = KILL ; GFX9: S_NOP 0 ; GFX9: $vgpr2 = V_MOV_B32_e32 0, implicit $exec - 
GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = KILL $vgpr2 = V_MOV_B32_e32 0, implicit $exec Index: llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -99,7 +99,7 @@ ; GCN: bb.2: ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load 4 from %stack.6, addrspace 5) ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load 16 from %stack.2, align 4, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1) ; GCN: S_ENDPGM 0 entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 Index: llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -7,9 +7,9 @@ define amdgpu_kernel void @s_input_output_i128() { ; CHECK-LABEL: name: s_input_output_i128 ; CHECK: bb.0 (%ir-block.0): - ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:SGPR_128 */, def %4 + ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 4128778 /* regdef:SGPR_128 */, def %4 ; CHECK: [[COPY:%[0-9]+]]:sgpr_128 = COPY %4 - ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:SGPR_128 */, [[COPY]] + ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4128777 /* reguse:SGPR_128 
*/, [[COPY]] ; CHECK: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -19,9 +19,9 @@ define amdgpu_kernel void @v_input_output_i128() { ; CHECK-LABEL: name: v_input_output_i128 ; CHECK: bb.0 (%ir-block.0): - ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_128 */, def %4 + ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_128 */, def %4 ; CHECK: [[COPY:%[0-9]+]]:vreg_128 = COPY %4 - ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3866633 /* reguse:VReg_128 */, [[COPY]] + ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_128 */, [[COPY]] ; CHECK: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -31,9 +31,9 @@ define amdgpu_kernel void @a_input_output_i128() { ; CHECK-LABEL: name: a_input_output_i128 ; CHECK: bb.0 (%ir-block.0): - ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3801098 /* regdef:AReg_128 */, def %4 + ; CHECK: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3932170 /* regdef:AReg_128 */, def %4 ; CHECK: [[COPY:%[0-9]+]]:areg_128 = COPY %4 - ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3801097 /* reguse:AReg_128 */, [[COPY]] + ; CHECK: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3932169 /* reguse:AReg_128 */, [[COPY]] ; CHECK: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) Index: llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir +++ llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir @@ -13,7 +13,7 @@ ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: FLAT_STORE_DWORD undef 
$vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK: FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; CHECK: bb.2: ; CHECK: S_ENDPGM 0 bb.0: @@ -23,7 +23,7 @@ bb.1: successors: %bb.2 $vgpr0 = V_MOV_B32_e32 0, implicit $exec - FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr bb.2: S_ENDPGM 0 @@ -41,7 +41,7 @@ ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; CHECK: bb.2: ; CHECK: S_ENDPGM 0 bb.0: @@ -51,7 +51,7 @@ bb.1: successors: %bb.2 $vgpr0 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2: S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir +++ llvm/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir @@ -49,10 +49,10 @@ bb.0 (%ir-block.2): $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from 
`float addrspace(1)* undef`) - $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) EXP_DONE 0, killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, -1, -1, 15, implicit $exec $vgpr0 = V_MOV_B32_e32 1056964608, implicit $exec $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec Index: llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -190,32 +190,32 @@ body: | bb.0: - BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec BUFFER_ATOMIC_CMPSWAP_X2_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec S_BRANCH %bb.1 bb.1: - FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $vgpr3 = V_MOV_B32_e32 0, implicit $exec - FLAT_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORDX3 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $vgpr3 = V_MOV_B32_e32 0, implicit $exec - FLAT_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORDX4 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $vgpr3 = V_MOV_B32_e32 0, implicit $exec - FLAT_ATOMIC_CMPSWAP_X2 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr + FLAT_ATOMIC_CMPSWAP_X2 
$vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit $exec, implicit $flat_scr $vgpr3 = V_MOV_B32_e32 0, implicit $exec - FLAT_ATOMIC_FCMPSWAP_X2 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr + FLAT_ATOMIC_FCMPSWAP_X2 $vgpr0_vgpr1, $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit $exec, implicit $flat_scr $vgpr3 = V_MOV_B32_e32 0, implicit $exec S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir +++ llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir @@ -64,7 +64,7 @@ liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 1, implicit $exec S_BRANCH %bb.3 @@ -72,7 +72,7 @@ liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 0, implicit $exec bb.3.done: @@ -80,7 +80,7 @@ $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir +++ llvm/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir @@ -12,7 +12,7 @@ S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -24,7 +24,7 @@ body: | bb.0: successors: %bb.1 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: @@ -56,11 +56,11 @@ body: | bb.0: successors: %bb.1 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -75,7 +75,7 @@ $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -87,7 +87,7 @@ body: | bb.0: $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -101,7 +101,7 @@ bb.0: successors: %bb.0 $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.0 ... @@ -118,8 +118,8 @@ S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -137,7 +137,7 @@ bb.1: $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -150,11 +150,11 @@ bb.0: successors: %bb.1 $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -171,7 +171,7 @@ bb.1: S_WAITCNT_VSCNT undef $sgpr_null, 1 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -189,7 +189,7 @@ bb.1: S_WAITCNT_VSCNT undef $sgpr_null, 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -206,7 +206,7 @@ bb.1: S_WAITCNT_VSCNT undef $sgpr0, 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -223,7 +223,7 @@ S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -239,7 +239,7 @@ S_BRANCH %bb.1 bb.1: - $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -255,7 +255,7 @@ S_BRANCH %bb.1 bb.1: - $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... @@ -271,6 +271,6 @@ S_BRANCH %bb.1 bb.1: - $vgpr1 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/limit-coalesce.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -57,15 +57,15 @@ %4.sub1 = COPY %3.sub0 undef %5.sub0 = COPY %4.sub1 %5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %6 = IMPLICIT_DEF undef %7.sub0_sub1 = COPY %6 %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %8 = IMPLICIT_DEF undef %9.sub0_sub1_sub2 = COPY %8 %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... 
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll @@ -0,0 +1,93 @@ +; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A +; RUN: not --crash llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908 + +declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) +declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) +declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)*, float) +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)*, <2 x half>) + +; GFX908: error: {{.*}} return versions of fp atomics not supported + +; GFX90A-LABEL: {{^}}buffer_atomic_add_f32: +; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen glc +define amdgpu_ps float @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +main_body: + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + ret float %ret +} + +; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_off4_slc: +; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 glc slc +define amdgpu_ps float @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { +main_body: + %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + ret float %ret +} + +; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16: +; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen glc +define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { +main_body: + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + 
ret <2 x half> %ret +} + +; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc: +; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 glc slc +define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) { +main_body: + %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + ret <2 x half> %ret +} + +; GFX90A-LABEL: {{^}}global_atomic_add_f32: +; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc +define amdgpu_ps float @global_atomic_add_f32(float addrspace(1)* %ptr, float %data) { +main_body: + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %ptr, float %data) + ret float %ret +} + +; GFX90A-LABEL: {{^}}global_atomic_add_f32_off4: +; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:4 glc +define amdgpu_ps float @global_atomic_add_f32_off4(float addrspace(1)* %ptr, float %data) { +main_body: + %p = getelementptr float, float addrspace(1)* %ptr, i64 1 + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data) + ret float %ret +} + +; GFX90A-LABEL: {{^}}global_atomic_add_f32_offneg4: +; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:-4 glc +define amdgpu_ps float @global_atomic_add_f32_offneg4(float addrspace(1)* %ptr, float %data) { +main_body: + %p = getelementptr float, float addrspace(1)* %ptr, i64 -1 + %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %p, float %data) + ret float %ret +} + +; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16: +; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc +define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { +main_body: + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) + ret <2 x half> %ret +} + +; GFX90A-LABEL: 
{{^}}global_atomic_pk_add_v2f16_off4: +; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:4 glc +define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_off4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { +main_body: + %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 1 + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) + ret <2 x half> %ret +} + +; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4: +; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-4 glc +define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_offneg4(<2 x half> addrspace(1)* %ptr, <2 x half> %data) { +main_body: + %p = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -1 + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16.v2f16(<2 x half> addrspace(1)* %p, <2 x half> %data) + ret <2 x half> %ret +} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN +; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GCN declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1) Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX6 %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GCN,MADMACF32,GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX101 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOMADMACF32,GFX103 %s Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX6789 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX6789 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s ; GCN-LABEL: {{^}}atomic_swap_1d: Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll @@ -0,0 +1,306 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s + +; GCN-LABEL: {{^}}load_1d: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_lwe: +; GCN: image_load v[0:4], 
v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}} +define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_2d: +; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_3d: +; GCN: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_cube: +; GCN: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_cube_lwe: +; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, 
i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1darray: +; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2darray: +; GCN: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2darray_lwe: +; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_2dmsaa: +; GCN: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2darraymsaa: +; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call <4 x float> 
@llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}store_1d: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_2d: +; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) { +main_body: + call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_3d: +; GCN: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r) { +main_body: + call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_cube: +; GCN: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) { +main_body: + call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_1darray: +; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice) { +main_body: + call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_2darray: +; GCN: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm 
da{{$}} +define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) { +main_body: + call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_2dmsaa: +; GCN: image_store v[0:3], v[4:6], s[0:7] dmask:0xf unorm{{$}} +define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %fragid) { +main_body: + call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}store_2darraymsaa: +; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}load_1d_V1: +; GCN: image_load v0, v0, s[0:7] dmask:0x8 unorm{{$}} +define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret float %v +} + +; GCN-LABEL: {{^}}load_1d_V2: +; GCN: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm{{$}} +define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %v +} + +; GCN-LABEL: {{^}}store_1d_V1: +; GCN: image_store v0, v1, s[0:7] dmask:0x2 unorm{{$}} +define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; 
GCN-LABEL: {{^}}store_1d_V2: +; GCN: image_store v[0:1], v2, s[0:7] dmask:0xc unorm{{$}} +define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}load_1d_glc: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc{{$}} +define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_slc: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc{{$}} +define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_glc_slc: +; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc{{$}} +define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}store_1d_glc: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc{{$}} +define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) + ret void +} + +; GCN-LABEL: {{^}}store_1d_slc: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc{{$}} +define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + ret void +} + +; GCN-LABEL: {{^}}store_1d_glc_slc: +; GCN: 
image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc{{$}} +define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { +main_body: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) + ret void +} + +; GCN-LABEL: {{^}}image_store_wait: +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf +; SI: s_waitcnt expcnt(0) +; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf +; GCN: s_waitcnt vmcnt(0) +; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf +define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 { +main_body: + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %arg3, i32 15, i32 %arg4, <8 x i32> %arg, i32 0, i32 0) + %data = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %arg4, <8 x i32> %arg1, i32 0, i32 0) + call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %data, i32 15, i32 %arg4, <8 x i32> %arg2, i32 0, i32 0) + ret void +} + +; GCN-LABEL: {{^}}image_load_mmo: +; GCN: image_load v1, v[4:5], s[0:7] dmask:0x1 unorm +define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)* %lds, <2 x i32> %c) #0 { + store float 0.000000e+00, float addrspace(3)* %lds + %c0 = extractelement <2 x i32> %c, i32 0 + %c1 = extractelement <2 x i32> %c, i32 1 + %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0) + %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 + store float 0.000000e+00, float addrspace(3)* %tmp2 + ret float %tex +} + +declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, 
i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 + +declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void 
@llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0 + +declare float @llvm.amdgcn.image.load.1d.f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare float @llvm.amdgcn.image.load.2d.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare void @llvm.amdgcn.image.store.1d.f32.i32(float, i32, i32, <8 x i32>, i32, i32) #0 +declare void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float>, i32, i32, <8 x i32>, i32, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.gfx90a.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A %s + +; GFX90A-LABEL: {{^}}sample_1d: +; GFX90A-NOT: s_wqm_b64 +; GFX90A: image_sample v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf +define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GFX90A-LABEL: {{^}}sample_1d_lwe: +; GFX90A-NOT: s_wqm_b64 +; GFX90A: image_sample v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf lwe +define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> 
inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GFX90A-LABEL: {{^}}sample_2d: +; GFX90A-NOT: s_wqm_b64 +; GFX90A: image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf +define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GFX90A-LABEL: {{^}}sample_3d: +; GFX90A-NOT: s_wqm_b64 +; GFX90A: image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf +define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %r) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GFX90A-LABEL: {{^}}sample_cube: +; GFX90A-NOT: s_wqm_b64 +; GFX90A: image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf da +define amdgpu_ps <4 x float> @sample_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GFX90A-LABEL: {{^}}sample_1darray: +; GFX90A-NOT: s_wqm_b64 +; GFX90A: image_sample v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf da +define amdgpu_ps <4 x 
float> @sample_1darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GFX90A-LABEL: {{^}}sample_1d_unorm: +; GFX90A-NOT: s_wqm_b64 +; GFX90A: image_sample v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], s[{{[0-9:]+}}] dmask:0xf unorm +define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 1, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) +declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) +declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) +declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) +declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll @@ -0,0 +1,142 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck 
-enable-var-scope --check-prefixes=GCN,GFX90A %s + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 
a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], +define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], +define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x 
float>, <16 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_4x4x2bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]], +define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <16 x 
float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_f32_16x16x8bf16 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]], +define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %a = bitcast i32 1 to <2 x i16> + %b = bitcast i32 2 to <2 x i16> + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll @@ -0,0 +1,194 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, 
i32) +declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32) +declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32) +declare i32 @llvm.amdgcn.workitem.id.x() + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k: +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 
a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 +; GFX90A-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 +; GFX90A: v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-NOT: v_accvgpr_read_b32 +; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] +define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg + %a = bitcast i64 1 to <4 x i16> + %b = bitcast i64 2 to <4 x i16> + %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3) + store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k: +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 +; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-NOT: v_accvgpr_read_b32 +; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] +define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %a = bitcast i64 1 to <4 x i16> + %b = bitcast i64 2 to <4 x i16> + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL:
{{^}}test_mfma_f32_4x4x4bf16_1k: +; GCN-DAG: s_load_dwordx4 +; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 +; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-NOT: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]], +define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %a = bitcast i64 1 to <4 x i16> + %b = bitcast i64 2 to <4 x i16> + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k: +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 +; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-NOT: v_accvgpr_read_b32 +; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] +define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %a = bitcast i64 1 to <4 x i16> + %b = bitcast i64 2 to <4 x i16> + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k: +; GCN-DAG: s_load_dwordx4 +; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 +;
GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-NOT: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 v{{[0-9]+}}, [[RES]], +define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %a = bitcast i64 1 to <4 x i16> + %b = bitcast i64 2 to <4 x i16> + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64: +; GFX90A: v_mfma_f64_4x4x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}} +; GFX90A: v_mfma_f64_4x4x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3 +; GCN: global_store_dwordx2 +define amdgpu_kernel void @test_mfma_f64_4x4x4f64(double addrspace(1)* %arg, double %a, double %b) { +bb: + %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0) + %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3) + store double %mai.2, double addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64: +; GCN: s_load_dwordx8 +; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN: global_store_dwordx4 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f64_16x16x4f64(<4 x double> addrspace(1)* %arg, double %a, double %b) { +bb: + %in.1 = load <4 x double>, <4 x double> addrspace(1)* %arg + %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double>
%in.1, i32 1, i32 2, i32 3) + store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm: +; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}} +; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3 +; GCN: global_store_dwordx4 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) { +bb: + %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0) + %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3) + store <4 x double> %mai.2, <4 x double> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_imm: +; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}} +; GCN: global_store_dwordx4 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) { +bb: + %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0) + store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_lit: +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000 +; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}} +; GCN: global_store_dwordx4 +; GCN: global_store_dwordx4 +define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(<4 x double> addrspace(1)* %arg, double %a, double %b) { +bb: + %mai.1 = tail call <4 x double>
@llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0) + store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg + ret void +} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s + +declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32) +declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32) + +; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] +define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3) + store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8: +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}},
v{{[0-9]+}} +; GCN: v_mfma_i32_16x16x16i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]] +define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg + ret void +} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC %s -; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) @@ -14,93 +15,56 @@ declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32) declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32) 
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32) -declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32) -declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32) -declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32) -declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) -declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32) -declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32) -declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32) declare i32 @llvm.amdgcn.workitem.id.x() ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN-DAG: s_load_dwordx16 -; GCN-DAG: s_load_dwordx16 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; 
GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; 
GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; 
GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908-COUNT-2: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32(<32 x float> addrspace(1)* %arg) { bb: %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg @@ -110,46 +74,15 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN: s_load_dwordx16 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx16 +; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_16x16x1f32(<16 x float> addrspace(1)* %arg) { bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -159,19 +92,15 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN: s_load_dwordx4 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -;
GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx4 +; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]] define amdgpu_kernel void @test_mfma_f32_4x4x1f32(<4 x float> addrspace(1)* %arg) { bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -181,46 +110,15 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN: s_load_dwordx16 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; 
GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx16 +; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_32x32x2f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x2f32(<16 x float> addrspace(1)* %arg) { bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -230,19 +128,15 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f32: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN: s_load_dwordx4 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_16x16x4f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN: s_load_dwordx4 +; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_16x16x4f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_16x16x4f32(<4 x float> addrspace(1)* %arg) { bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -252,81 +146,14 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_32x32x4f16: -; GCN-DAG: s_load_dwordx16 -; GCN-DAG: s_load_dwordx16 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} 
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GFX908_A-COUNT-32: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_32x32x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x4f16(<32 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { bb: %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg @@ -339,44 +166,13 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x4f16: -; GCN: s_load_dwordx16 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 
-; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN: s_load_dwordx16 +; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_16x16x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_16x16x4f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -389,19 +185,15 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_4x4x4f16: -; GCN: s_load_dwordx4 -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_4x4x4f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx2 +; GCN: s_load_dwordx2 +; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_4x4x4f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_4x4x4f16(<4 x float> 
addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -414,46 +206,15 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16: -; GCN: s_load_dwordx16 -; GCN: s_waitcnt lgkmcnt(0) -; GCN: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN: s_load_dwordx16 +; GCN: s_waitcnt lgkmcnt(0) +; GFX908_A: v_mov_b32_e32 v{{[0-9]+}}, 
s{{[0-9]+}} +; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x8f16(<16 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -466,18 +227,14 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x16f16: -; GCN: s_load_dwordx4 -; GCN: s_load_dwordx4 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_16x16x16f16 a[{{[0-9]+:[0-9]+}}], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx4 +; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_f32_16x16x16f16 [[RES:a\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_16x16x16f16(<4 x float> addrspace(1)* %arg, <4 x half> addrspace(1)* %c) { bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -490,83 +247,47 @@ } ; GCN-LABEL: {{^}}test_mfma_i32_32x32x4i8: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN-DAG: s_load_dwordx16 -; 
GCN-DAG: s_load_dwordx16 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; 
GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_load_dwordx16 +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; 
GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_i32_32x32x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_i32_32x32x4i8(<32 x i32> addrspace(1)* %arg) { bb: %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg @@ -576,46 +297,15 @@ } ; GCN-LABEL: {{^}}test_mfma_i32_16x16x4i8: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: s_load_dwordx16 -; GCN: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx16 +; GFX908_A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_i32_16x16x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: 
global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_i32_16x16x4i8(<16 x i32> addrspace(1)* %arg) { bb: %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg @@ -625,19 +315,15 @@ } ; GCN-LABEL: {{^}}test_mfma_i32_4x4x4i8: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: s_load_dwordx4 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_i32_4x4x4i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 +; GCN: s_load_dwordx4 +; GFX908_A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908_A: v_mfma_i32_4x4x4i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_i32_4x4x4i8(<4 x i32> addrspace(1)* %arg) { bb: %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg @@ -646,318 +332,9 @@ ret void } -; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: s_load_dwordx16 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_i32_32x32x8i8(<16 x i32> addrspace(1)* %arg) { -bb: - %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg - %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3) - store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg - ret void -} - -; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: s_load_dwordx4 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_i32_16x16x16i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 
a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_i32_16x16x16i8(<4 x i32> addrspace(1)* %arg) { -bb: - %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg - %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3) - store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg - ret void -} - -; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN-DAG: s_load_dwordx16 -; GCN-DAG: s_load_dwordx16 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_32x32x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(<32 x float> 
addrspace(1)* %arg) { -bb: - %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg - %a = bitcast i32 1 to <2 x i16> - %b = bitcast i32 2 to <2 x i16> - %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3) - store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg - ret void -} - -; GCN-LABEL: {{^}}test_mfma_f32_16x16x2bf16: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: s_load_dwordx16 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_16x16x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: 
global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(<16 x float> addrspace(1)* %arg) { -bb: - %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg - %a = bitcast i32 1 to <2 x i16> - %b = bitcast i32 2 to <2 x i16> - %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) - store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg - ret void -} - -; GCN-LABEL: {{^}}test_mfma_f32_4x4x2bf16: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: s_load_dwordx4 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_4x4x2bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(<4 x float> addrspace(1)* %arg) { -bb: - %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg - %a = bitcast i32 1 to <2 x i16> - %b = bitcast i32 2 to <2 x i16> - %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) - store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg - ret void -} - -; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: s_load_dwordx16 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: 
v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_32x32x4bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(<16 x float> addrspace(1)* %arg) { -bb: - %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg - %a = bitcast i32 1 to <2 x i16> - %b = bitcast i32 2 to <2 x i16> - %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3) - store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg - ret void -} - -; GCN-LABEL: {{^}}test_mfma_f32_16x16x8bf16: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 -; GCN: s_load_dwordx4 -; GCN: v_accvgpr_write_b32 
a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_16x16x8bf16 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(<4 x float> addrspace(1)* %arg) { -bb: - %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg - %a = bitcast i32 1 to <2 x i16> - %b = bitcast i32 2 to <2 x i16> - %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3) - store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg - ret void -} - ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_forward_acc: -; GCN: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GCN-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] +; GFX908_A: v_mfma_f32_32x32x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GFX908_A-NEXT: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(<32 x float> addrspace(1)* %arg) { bb: %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg @@ -968,8 +345,8 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_forward_acc: -; GCN: v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GCN-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] +; GFX908_A: v_mfma_f32_16x16x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GFX908_A-NEXT: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] define 
amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(<16 x float> addrspace(1)* %arg) { bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -980,8 +357,8 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_forward_acc: -; GCN: v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GCN-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] +; GFX908_A: v_mfma_f32_4x4x1f32 [[MAI1:a\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GFX908_A-NEXT: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, [[MAI1]] define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(<4 x float> addrspace(1)* %arg) { bb: %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg @@ -992,19 +369,19 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 ; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] -; LIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: global_store_dwordx4 +; NOLIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] +; LIT-SRCC: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0 +; GFX90A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], 1.0 +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define 
amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(<4 x float> addrspace(1)* %arg) { bb: %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> , i32 0, i32 0, i32 0) @@ -1013,31 +390,16 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] -; LIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; NOLIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] +; LIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 +; GFX90A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(<16 x float> addrspace(1)* %arg) { bb: %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, 
<16 x float> , i32 0, i32 0, i32 0) @@ -1046,31 +408,16 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat: -; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000 -; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}] -; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000 +; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}] +; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 +; GFX90A: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(<16 x float> addrspace(1)* %arg) { bb: %mai.1 = tail call <16 x float> 
@llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> , <4 x half> , <16 x float> , i32 0, i32 0, i32 0) @@ -1079,51 +426,16 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 -; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] -; LIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] +; LIT-SRCC: 
v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 +; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(<32 x float> addrspace(1)* %arg) { bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) @@ -1132,16 +444,17 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm: -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 -; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: global_store_dwordx4 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]], define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(<4 x float> addrspace(1)* %arg) { bb: %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> , i32 0, i32 0, i32 0) @@ -1150,43 +463,15 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm: -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 
2.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 2.0 +; GFX908-COUNT-14: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GFX90A-COUNT-14: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX908_A: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GFX908-COUNT-16: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-4: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(<16 x float> addrspace(1)* 
%arg) { bb: %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> , i32 0, i32 0, i32 0) @@ -1195,79 +480,73 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm: -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 
-; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: v_accvgpr_read_b32 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 -; GCN-DAG: global_store_dwordx4 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 
a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX90A-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}} +; GFX908_A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-8: global_store_dwordx4 {{v[0-9]+}}, a[{{[0-9:]+}}], define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(<32 x float> addrspace(1)* %arg) { bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) @@ -1276,17 +555,19 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat: -; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: global_store_dwordx4 +; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 +; GCN: v_accvgpr_write_b32 [[TTMPA:a[0-9]+]], [[TMP]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]] +; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]] +; GFX90A: v_accvgpr_mov_b32 a{{[0-9]+}}, [[TTMPA]] +; GFX908_A: v_mfma_f32_4x4x1f32 [[RES:a\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}, 
a[{{[0-9]+:[0-9]+}}] +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: global_store_dwordx4 {{v[0-9]+}}, [[RES]] define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(<4 x float> addrspace(1)* %arg, i64 %idx) { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1301,22 +582,21 @@ ; in the middle of the expanded agpr reg_sequence. The broadcast of ; the individual AGPR->AGPR components should avoid the intermediate AGPR case. ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code: -; GCN: v_mov_b32_e32 [[VTMP0:v[0-9]+]], 0x42f60000 -; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[VTMP0]] -; GCN: s_nop 0 -; GCN: v_accvgpr_read_b32 [[VTMP1:v[0-9]+]], [[AGPR]] -; GCN: v_accvgpr_read_b32 [[VTMP2:v[0-9]+]], [[AGPR]] -; GCN: v_accvgpr_read_b32 [[VTMP3:v[0-9]+]], [[AGPR]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[VTMP3]] +; GFX908_A: v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000 +; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]] +; GFX908: s_nop 0 +; GFX908: v_accvgpr_read_b32 [[TMP1:v[0-9]+]], [[AGPR]] +; GFX908: v_accvgpr_read_b32 [[TMP2:v[0-9]+]], [[AGPR]] +; GFX908: v_accvgpr_read_b32 [[TMP3:v[0-9]+]], [[AGPR]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX90A-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]] ; GCN: s_nop 0 -; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: v_accvgpr_read_b32 -; GCN: global_store_dwordx4 +; GFX908_A: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}] +; GFX908-COUNT-4: v_accvgpr_read_b32 +; GFX908: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GFX90A: 
global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(<4 x float> addrspace(1)* %arg) { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1328,35 +608,18 @@ } ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg: -; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GFX90A-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN-DAG: global_load_dwordx4 -; GCN-DAG: global_load_dwordx4 -; GCN-DAG: global_load_dwordx4 -; GCN-DAG: global_load_dwordx4 -; GCN-DAG: global_load_dwordx4 -; GCN-DAG: global_load_dwordx4 -; GCN-DAG: global_load_dwordx4 -; GCN-DAG: global_load_dwordx4 -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} -; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 -; GCN-COUNT-32: v_accvgpr_read_b32 -; GCN-COUNT-8: global_store_dwordx4 +; GCN-COUNT-8: global_load_dwordx4 +; GFX908-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GFX908-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 +; GFX90A-NOT: v_accvgpr_write +; GFX908: 
v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A-COUNT-5: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(<32 x float> addrspace(1)* %arg) { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll @@ -4,6 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=ALL,MESA %s ; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s ; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,PACKED-TID %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i32 @llvm.amdgcn.workitem.id.y() #0 @@ -17,7 +18,10 @@ ; CO-V2: enable_vgpr_workitem_id = 0 ; ALL-NOT: v0 -; ALL: {{buffer|flat}}_store_dword {{.*}}v0 +; ALL: {{buffer|flat|global}}_store_dword {{.*}}v0 + +; CO-V3: .amdhsa_system_vgpr_workitem_id 0 +; PACKED-TID: .amdhsa_system_vgpr_workitem_id 0 define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() store i32 %id, i32 addrspace(1)* %out @@ -31,8 +35,14 @@ ; ALL-LABEL: {{^}}test_workitem_id_y: ; CO-V2: enable_vgpr_workitem_id = 1 -; ALL-NOT: v1 -; ALL: 
{{buffer|flat}}_store_dword {{.*}}v1 +; PACKED-TID: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10 +; PACKED-TID: {{buffer|flat|global}}_store_dword {{.*}}[[ID]] + +; UNPACKED-TID-NOT: v1 +; UNPACKED-TID: {{buffer|flat}}_store_dword {{.*}}v1 + +; CO-V3: .amdhsa_system_vgpr_workitem_id 1 +; PACKED-TID: .amdhsa_system_vgpr_workitem_id 1 define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.y() store i32 %id, i32 addrspace(1)* %out @@ -46,8 +56,14 @@ ; ALL-LABEL: {{^}}test_workitem_id_z: ; CO-V2: enable_vgpr_workitem_id = 2 -; ALL-NOT: v2 -; ALL: {{buffer|flat}}_store_dword {{.*}}v2 +; PACKED-TID: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 +; PACKED-TID: {{buffer|flat|global}}_store_dword {{.*}}[[ID]] + +; UNPACKED-TID-NOT: v2 +; UNPACKED-TID: {{buffer|flat}}_store_dword {{.*}}v2 + +; CO-V3: .amdhsa_system_vgpr_workitem_id 2 +; PACKED-TID: .amdhsa_system_vgpr_workitem_id 2 define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.z() store i32 %id, i32 addrspace(1)* %out Index: llvm/test/CodeGen/AMDGPU/llvm.pow-gfx9.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/llvm.pow-gfx9.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=amdgcn -mcpu=gfx908 | FileCheck %s --check-prefixes=GCN,GFX908 +; RUN: llc < %s -march=amdgcn -mcpu=gfx90a | FileCheck %s --check-prefixes=GCN,GFX90A + +; GCN-LABEL: {{^}}mul_legacy +; GFX908: v_mul_legacy_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX90A: v_mul_legacy_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @mul_legacy( + float addrspace(1)* %r, + float addrspace(1)* %a, + float addrspace(1)* %b) { +entry: + %a.val = load volatile float, float addrspace(1)* %a + %b.val = load volatile float, float addrspace(1)* %b + %r.val = call float @llvm.pow.f32(float %a.val, float %b.val) + store float %r.val, float addrspace(1)* %r + ret void +} + 
+declare float @llvm.pow.f32(float ,float ) readonly Index: llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir +++ llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir @@ -203,7 +203,7 @@ ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] - ; CHECK: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) + ; CHECK: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) ; CHECK: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]] ; CHECK: bb.2: ; CHECK: successors: %bb.3(0x80000000) @@ -233,7 +233,7 @@ bb.1: %11:sreg_64_xexec = COPY %13 - dead %6:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) + dead %6:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4, addrspace 1) %14:sreg_64_xexec = COPY %11 bb.2: Index: llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir @@ -0,0 +1,1310 @@ +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: valu_write_vgpr_sgemm_mfma_read +# GCN: V_MOV_B32 +# GCN: V_MOV_B32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MFMA +name: valu_write_vgpr_sgemm_mfma_read +body: | + bb.0: + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: valu_write_agpr_sgemm_mfma_read +# GCN: V_ACCVGPR_WRITE_B32_e64 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MFMA +name: valu_write_agpr_sgemm_mfma_read +body: | + bb.0: + $vgpr0 = IMPLICIT_DEF + $agpr4 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr4, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: valu_write_vgpr_dgemm_mfma_read +# GCN: V_MOV_B32 +# GCN: V_MOV_B32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MFMA +name: valu_write_vgpr_dgemm_mfma_read +body: | + bb.0: + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: accmov_write_agpr_sgemm_mfma_read +# GCN: V_ACCVGPR_MOV_B32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MFMA +name: accmov_write_agpr_sgemm_mfma_read +body: | + bb.0: + $vgpr0 = IMPLICIT_DEF + $agpr4 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr4, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm_mfma_write_agpr_mfma_read_same_agpr_as_srcc +# GCN: V_MFMA +# GCN-NEXT: V_MFMA +name: sgemm_mfma_write_agpr_mfma_read_same_agpr_as_srcc +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc +# GCN: V_MFMA +# GCN-NEXT: V_MFMA +name: sgemm_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr5, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc +# GCN: V_MFMA +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc +# GCN: V_MFMA +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_vgpr_mfma_read_same_vgpr_as_srcc +body: | + bb.0: + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_agpr_mfma_read_overlap +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_vgpr_mfma_read_overlap +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr6, $vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr6, $vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: V_MFMA +name: sgemm16x16_mfma_write_agpr_mfma_read_overlap +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: V_MFMA +name: sgemm16x16_mfma_write_vgpr_mfma_read_overlap +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: V_MFMA +name: sgemm32x32_mfma_write_agpr_mfma_read_overlap +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_F32_32X32X2F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: V_MFMA +name: sgemm32x32_mfma_write_vgpr_mfma_read_overlap +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_vgpr_mfma_read_overlap +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_vgpr_mfma_read_overlap +body: | + bb.0: + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_read_overlap +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr10, $vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_sgemm_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_vgpr_sgemm_mfma_read_overlap +body: | + bb.0: + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr10, $vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dgemm_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_vgpr_dgemm_mfma_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr10, $vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MFMA +name: sgemm16x16_mfma_write_vgpr_dgemm_mfma_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MFMA +name: sgemm32x32_mfma_write_vgpr_dgemm_mfma_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr26, $vgpr27, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_read_partial +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: V_MFMA +name: sgemm16x16_mfma_write_agpr_mfma_read_partial +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm16x16_mfma_write_vgpr_mfma_read_partial +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: V_MFMA +name: sgemm16x16_mfma_write_vgpr_mfma_read_partial +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_agpr_mfma_srca_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr20_agpr21_agpr22_agpr23 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr20_agpr21_agpr22_agpr23, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr20_agpr21_agpr22_agpr23 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr20_agpr21_agpr22_agpr23, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr0, $agpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dmfma4x4_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_vgpr_dmfma4x4_srca_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dmfma16x16_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_vgpr_dmfma16x16_srca_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_vgpr_mfma_srca_read_overlap +body: | + bb.0: + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr2_vgpr3, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec +...
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_vgpr_mfma_srca_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_sgemm_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_vgpr_sgemm_mfma_srca_read_overlap +body: | + bb.0: + $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_vgpr_sgemm_mfma_srca_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_dgemm_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_vgpr_dgemm_mfma_srca_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $agpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $agpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_vgpr_mfma_srcb_read_overlap +body: | + bb.0: + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_vgpr_mfma_srcb_read_overlap +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr0_vgpr1, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma4x4_write_vgpr_vm_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: BUFFER_STORE_DWORD +name: smfma4x4_write_vgpr_vm_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma4x4_write_vgpr_flat_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: FLAT_STORE_DWORD +name: smfma4x4_write_vgpr_flat_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr +... +# GCN-LABEL: name: smfma4x4_write_vgpr_lds_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: DS_WRITE_B32 +name: smfma4x4_write_vgpr_lds_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + DS_WRITE_B32 $vgpr0, $vgpr4, 0, 0, implicit $m0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: smfma4x4_write_vgpr_exp_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: EXP_DONE +name: smfma4x4_write_vgpr_exp_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + EXP_DONE 12, $vgpr4, $vgpr0, $vgpr0, $vgpr0, 0, 0, 15, implicit $exec +... +# GCN-LABEL: name: smfma16x16_write_vgpr_flat_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: FLAT_STORE_DWORD +name: smfma16x16_write_vgpr_flat_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + FLAT_STORE_DWORD $vgpr16_vgpr17, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr +... +# GCN-LABEL: name: smfma32x32_write_vgpr_flat_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: FLAT_STORE_DWORD +name: smfma32x32_write_vgpr_flat_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + FLAT_STORE_DWORD $vgpr16_vgpr17, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr +... 
+# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: FLAT_STORE_DWORD +name: dmfma4x4_write_vgpr_flat_read_overlap +body: | + bb.0: + $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr +... +# GCN-LABEL: name: dmfma4x4_write_vgpr_flat_read_full +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: FLAT_STORE_DWORD +name: dmfma4x4_write_vgpr_flat_read_full +body: | + bb.0: + $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr +... +# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: FLAT_STORE_DWORD +name: dmfma16x16_write_vgpr_flat_read +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr +... +# GCN-LABEL: name: smfma4x4_write_vgpr_valu_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MOV_B32 +name: smfma4x4_write_vgpr_valu_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: smfma16x16_write_vgpr_valu_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32 +name: smfma16x16_write_vgpr_valu_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma32x32_write_vgpr_valu_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32 +name: smfma32x32_write_vgpr_valu_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dmfma4x4_write_vgpr_valu_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_MOV_B32 +name: dmfma4x4_write_vgpr_valu_read +body: | + bb.0: + $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_MOV_B32_e32 $vgpr4, implicit $exec +... +# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32 +name: dmfma16x16_write_vgpr_valu_read +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr12 = V_MOV_B32_e32 $vgpr4, implicit $exec +... 
+# GCN-LABEL: name: smfma4x4_write_vgpr_accv_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: smfma4x4_write_vgpr_accv_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma16x16_write_vgpr_accv_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: smfma16x16_write_vgpr_accv_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma32x32_write_vgpr_accv_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: smfma32x32_write_vgpr_accv_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma4x4_write_vgpr_dot_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_DOT +name: smfma4x4_write_vgpr_dot_read +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec +... 
+# GCN-LABEL: name: dmfma4x4_write_vgpr_dot_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_DOT +name: dmfma4x4_write_vgpr_dot_read +body: | + bb.0: + $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr4, $vgpr1, implicit $exec +... +# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_DOT +name: dmfma16x16_write_vgpr_dot_read +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr4, $vgpr1, implicit $exec +... +# GCN-LABEL: name: smfma4x4_write_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MOV_B32 +name: smfma4x4_write_vgpr_valu_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr4, $vgpr0, $vgpr6_vgpr7_vgpr8_vgpr9, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma16x16_write_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32 +name: smfma16x16_write_vgpr_valu_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: smfma32x32_write_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32 +name: smfma32x32_write_vgpr_valu_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma4x4_write_vgpr_valu_f16_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_FMA_F16_e64 +name: smfma4x4_write_vgpr_valu_f16_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_FMA_F16_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma16x16_write_vgpr_valu_f16_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_FMA_F16_e64 +name: smfma16x16_write_vgpr_valu_f16_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_FMA_F16_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: smfma32x32_write_vgpr_valu_f16_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_FMA_F16_e64 +name: smfma32x32_write_vgpr_valu_f16_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_FMA_F16_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma4x4_write_vgpr_valu_sdwa_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MOV_B32_sdwa +name: smfma4x4_write_vgpr_valu_sdwa_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_sdwa 0, $vgpr16, 0, 5, 2, 4, implicit $exec, implicit $vgpr1(tied-def 0) +... +# GCN-LABEL: name: smfma16x16_write_vgpr_valu_sdwa_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32_sdwa +name: smfma16x16_write_vgpr_valu_sdwa_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr16, $vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_sdwa 0, $vgpr16, 0, 5, 2, 4, implicit $exec, implicit $vgpr1(tied-def 0) +... 
+# GCN-LABEL: name: smfma32x32_write_vgpr_valu_sdwa_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32_sdwa +name: smfma32x32_write_vgpr_valu_sdwa_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_sdwa 0, $vgpr16, 0, 5, 2, 4, implicit $exec, implicit $vgpr1(tied-def 0) +... +# GCN-LABEL: name: dmfma4x4_write_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_MOV_B32 +name: dmfma4x4_write_vgpr_valu_write +body: | + bb.0: + $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr3 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32 +name: dmfma16x16_write_vgpr_valu_write +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr3 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma4x4_write_vgpr_accv_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_ACCVGPR_READ_B32_e64 +name: smfma4x4_write_vgpr_accv_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: smfma4x4_write_vgpr_dot_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_DOT +name: smfma4x4_write_vgpr_dot_write +body: | + bb.0: + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr1, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_DOT8_I32_I4 0, $vgpr4, 0, $vgpr4, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma4x4_read_srcc_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MOV_B32 +name: smfma4x4_read_srcc_vgpr_valu_write +body: | + bb.0: + $vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr8, $vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma16x16_read_srcc_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 6 +# GCN-NEXT: V_MOV_B32 +name: smfma16x16_read_srcc_vgpr_valu_write +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma32x32_read_srcc_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 6 +# GCN-NEXT: V_MOV_B32 +name: smfma32x32_read_srcc_vgpr_valu_write +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr0, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: smfma4x4_read_srca_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_MOV_B32 +name: smfma4x4_read_srca_vgpr_valu_write +body: | + bb.0: + $vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr8, $vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr8 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma16x16_read_srca_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_MOV_B32 +name: smfma16x16_read_srca_vgpr_valu_write +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr18 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma32x32_read_srca_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_MOV_B32 +name: smfma32x32_read_srca_vgpr_valu_write +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr18 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma4x4_read_srcb_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_MOV_B32 +name: smfma4x4_read_srcb_vgpr_valu_write +body: | + bb.0: + $vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $vgpr8, $vgpr9, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr9 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: smfma16x16_read_srcb_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_MOV_B32 +name: smfma16x16_read_srcb_vgpr_valu_write +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_16X16X1F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr19 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: smfma32x32_read_srcb_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_MOV_B32 +name: smfma32x32_read_srcb_vgpr_valu_write +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F32_32X32X2F32_vgprcd_e64 $vgpr18, $vgpr19, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec + $vgpr19 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dmfma4x4_read_srcc_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_MOV_B32 +name: dmfma4x4_read_srcc_vgpr_valu_write +body: | + bb.0: + $vgpr3_vgpr4 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dmfma16x16_read_srcc_vgpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_MOV_B32 +name: dmfma16x16_read_srcc_vgpr_valu_write +body: | + bb.0: + $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: smfma16x16_read_srcc_vgpr_accv_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 6 +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: smfma16x16_read_srcc_vgpr_accv_write +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = V_MFMA_F32_16X16X1F32_e64 $agpr18, $agpr19, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm_to_fma64 +# GCN: V_MFMA +# GCN-NEXT: V_FMA_F64_e64 +name: sgemm_to_fma64 +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm_to_fma64 +# GCN: V_MFMA +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_FMA_F64_e64 +name: dgemm_to_fma64 +body: | + bb.0: + $vgpr0_vgpr1 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm_to_fmac64 +# GCN: V_MFMA +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_FMAC_F64 +name: dgemm_to_fmac64 +body: | + bb.0: + $vgpr0_vgpr1 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: flat_store_data_agpr_overwritten +# GCN: FLAT_STORE_DWORDX4 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: flat_store_data_agpr_overwritten +body: | + bb.0: + FLAT_STORE_DWORDX4 $vgpr4_vgpr5, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr + $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dot_write_vgpr_accv_read +# GCN: V_DOT +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: dot_write_vgpr_accv_read +body: | + bb.0: + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec + $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec +... +# GCN-LABEL: name: valu_write_vgpr_dot_read +# GCN: V_MOV_B32 +# GCN-NEXT: V_DOT +name: valu_write_vgpr_dot_read +body: | + bb.0: + $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec +... +# GCN-LABEL: name: accv_write_vgpr_dot_read +# GCN: V_ACCVGPR_READ +# GCN-NEXT: V_DOT +name: accv_write_vgpr_dot_read +body: | + bb.0: + $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec +... +# GCN-LABEL: name: dot_write_vgpr_same_dot_read_srcc +# GCN: V_DOT +# GCN-NEXT: V_DOT +name: dot_write_vgpr_same_dot_read_srcc +body: | + bb.0: + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec +... +# GCN-LABEL: name: dot_write_vgpr_different_dot_read_srcc +# GCN: V_DOT +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_DOT +name: dot_write_vgpr_different_dot_read_srcc +body: | + bb.0: + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec + $vgpr1 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: dot_write_vgpr_different_dot_write +# GCN: V_DOT +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_DOT +name: dot_write_vgpr_different_dot_write +body: | + bb.0: + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec + $vgpr4 = V_DOT8_I32_I4 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dot_write_vgpr_different_valu_read +# GCN: V_DOT +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32_e32 +name: dot_write_vgpr_different_valu_read +body: | + bb.0: + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec + $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec +... +# GCN-LABEL: name: dot_write_vgpr_different_valu_write +# GCN: V_DOT +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MOV_B32_e32 +name: dot_write_vgpr_different_valu_write +body: | + bb.0: + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec + $vgpr4 = V_MOV_B32_e32 1, implicit $exec +... +# GCN-LABEL: name: dot_write_vgpr_same_dot_read_srca +# GCN: V_DOT +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_DOT +name: dot_write_vgpr_same_dot_read_srca +body: | + bb.0: + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec + $vgpr0 = V_DOT4C_I32_I8_e32 $vgpr4, $vgpr1, $vgpr0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dot_write_vgpr_same_dot_read_srcb +# GCN: V_DOT +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_DOT +name: dot_write_vgpr_same_dot_read_srcb +body: | + bb.0: + $vgpr4 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr1, $vgpr4, implicit $exec + $vgpr0 = V_DOT4C_I32_I8_e32 $vgpr0, $vgpr4, $vgpr0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: vcmpx_write_exec_mfma +# GCN: V_CMPX_EQ_I32_e32 +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_MFMA +name: vcmpx_write_exec_mfma +body: | + bb.0: + implicit $exec, implicit $vcc = V_CMPX_EQ_I32_e32 $vgpr0, $vgpr1, implicit $exec + $agpr4_agpr5_agpr6_agpr7 = V_MFMA_F32_4X4X1F32_e64 killed $agpr8, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: valu_write_agpr_dgemm_mfma_read +# GCN: V_ACCVGPR_WRITE_B32_e64 +# GCN: V_ACCVGPR_WRITE_B32_e64 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_MFMA +name: valu_write_agpr_dgemm_mfma_read +body: | + bb.0: + $agpr0 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec + $agpr1 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec + $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_same_agpr_as_srcc +# GCN: V_MFMA +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_agpr_mfma_read_same_agpr_as_srcc +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $agpr10_agpr11, $agpr10_agpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $agpr10_agpr11, $agpr10_agpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_read_same_agpr_as_srcc +# GCN: V_MFMA +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_agpr_mfma_read_same_agpr_as_srcc +body: | + bb.0: + $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $vgpr0_vgpr1, $agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $vgpr0_vgpr1, $agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_agpr_mfma_read_overlap +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_agpr_mfma_read_overlap +body: | + bb.0: + $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_agpr_sgemm_mfma_read_overlap +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dgemm_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_agpr_dgemm_mfma_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr10, $vgpr11, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MFMA +name: sgemm16x16_mfma_write_sgpr_dgemm_mfma_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: V_MFMA +name: sgemm32x32_mfma_write_agpr_dgemm_mfma_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr26, $vgpr27, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr20_vgpr21, $vgpr20_vgpr21, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dmfma4x4_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_agpr_dmfma4x4_srca_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dmfma16x16_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_agpr_dmfma16x16_srca_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_agpr_mfma_srca_read_overlap +body: | + bb.0: + $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $agpr2_agpr3, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_agpr_mfma_srca_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr10_vgpr11, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_sgemm_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_agpr_sgemm_mfma_srca_read_overlap +body: | + bb.0: + $agpr4_agpr5 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $agpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_agpr_sgemm_mfma_srca_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_MFMA_F32_4X4X1F32_vgprcd_e64 $agpr4, $vgpr0, $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: sgemm4x4_mfma_write_agpr_dgemm_mfma_srca_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 4 +# GCN-NEXT: V_MFMA +name: sgemm4x4_mfma_write_agpr_dgemm_mfma_srca_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $agpr0_agpr1, $vgpr4_vgpr5, $vgpr4_vgpr5, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_MFMA +name: dgemm4x4_mfma_write_agpr_mfma_srcb_read_overlap +body: | + bb.0: + $agpr2_agpr3 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_vgpr3 = V_MFMA_F64_4X4X4F64_vgprcd_e64 $vgpr0_vgpr1, $agpr2_agpr3, $vgpr0_vgpr1, 0, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_MFMA +name: dgemm16x16_mfma_write_agpr_mfma_srcb_read_overlap +body: | + bb.0: + $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17, 0, 0, 0, implicit $mode, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_MFMA_F64_16X16X4F64_vgprcd_e64 $vgpr10_vgpr11, $agpr0_agpr1, $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, 0, 0, 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_overlap +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: FLAT_STORE_DWORD +name: dmfma4x4_write_agpr_flat_read_overlap +body: | + bb.0: + $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr +... +# GCN-LABEL: name: dmfma4x4_write_agpr_flat_read_full +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 0 +# GCN-NEXT: FLAT_STORE_DWORD +name: dmfma4x4_write_agpr_flat_read_full +body: | + bb.0: + $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, $agpr3_agpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr +... 
+# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: FLAT_STORE_DWORD +name: dmfma16x16_write_agpr_flat_read +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + FLAT_STORE_DWORD $vgpr0_vgpr1, $agpr4, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit $flat_scr +... +# GCN-LABEL: name: dmfma4x4_write_agpr_valu_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_ACCVGPR_READ_B32_e64 +name: dmfma4x4_write_agpr_valu_read +body: | + bb.0: + $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec +... +# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_ACCVGPR_READ_B32_e64 +name: dmfma16x16_write_agpr_valu_read +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec +... +# GCN-LABEL: name: dmfma4x4_write_agpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 5 +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: dmfma4x4_write_agpr_valu_write +body: | + bb.0: + $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: S_NOP 7 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: dmfma16x16_write_agpr_valu_write +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $agpr3 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dmfma4x4_read_srcc_agpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: dmfma4x4_read_srcc_agpr_valu_write +body: | + bb.0: + $agpr3_agpr4 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dmfma16x16_read_srcc_agpr_valu_write +# GCN: V_MFMA +# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64 +name: dmfma16x16_read_srcc_agpr_valu_write +body: | + bb.0: + $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = V_MFMA_F64_16X16X4F64_e64 $vgpr10_vgpr11, $vgpr10_vgpr11, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, 0, 0, 0, implicit $mode, implicit $exec + $agpr1 = V_ACCVGPR_WRITE_B32_e64 0, implicit $mode, implicit $exec +... +# GCN-LABEL: name: dgemm_accvgr_to_fma64 +# GCN: V_MFMA +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_FMA_F64_e64 +name: dgemm_accvgr_to_fma64 +body: | + bb.0: + $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $vgpr0_vgpr1, $vgpr0_vgpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_FMA_F64_e64 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $exec +... 
+# GCN-LABEL: name: dgemm_accvgr_to_fmac64 +# GCN: V_MFMA +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_FMAC_F64 +name: dgemm_accvgr_to_fmac64 +body: | + bb.0: + $agpr0_agpr1 = V_MFMA_F64_4X4X4F64_e64 $agpr0_agpr1, $agpr0_agpr1, $agpr0_agpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4_vgpr5 = V_FMAC_F64_e32 $vgpr4_vgpr5, $vgpr4_vgpr5, $vgpr4_vgpr5, implicit $mode, implicit $exec +... Index: llvm/test/CodeGen/AMDGPU/mai-hazards.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -312,7 +312,7 @@ body: | bb.0: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... --- @@ -336,7 +336,7 @@ body: | bb.0: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... --- @@ -348,7 +348,7 @@ body: | bb.0: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... --- @@ -373,7 +373,7 @@ bb.0: $vgpr0 = V_MOV_B32_e32 1, implicit $exec $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... 
--- @@ -387,7 +387,7 @@ bb.0: $vgpr0 = V_MOV_B32_e32 1, implicit $exec $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... --- @@ -401,7 +401,7 @@ bb.0: $vgpr0 = V_MOV_B32_e32 1, implicit $exec $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... --- @@ -415,6 +415,6 @@ bb.0: $vgpr0 = V_MOV_B32_e32 1, implicit $exec $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... --- Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir @@ -81,7 +81,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 0 S_WAITCNT 127 - $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep) + $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep) $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: @@ -25,6 +27,16 @@ ; SKIP-CACHE-INV-LABEL: singlethread_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: singlethread_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: singlethread_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("singlethread") acquire ret void @@ -50,6 +62,16 @@ ; SKIP-CACHE-INV-LABEL: singlethread_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: singlethread_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: singlethread_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("singlethread") release ret void @@ -75,6 +97,16 @@ ; SKIP-CACHE-INV-LABEL: singlethread_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
singlethread_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: singlethread_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("singlethread") acq_rel ret void @@ -100,6 +132,16 @@ ; SKIP-CACHE-INV-LABEL: singlethread_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: singlethread_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: singlethread_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("singlethread") seq_cst ret void @@ -125,6 +167,16 @@ ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("singlethread-one-as") acquire ret void @@ -150,6 +202,16 @@ ; SKIP-CACHE-INV-LABEL: singlethread_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: singlethread_one_as_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("singlethread-one-as") release ret void @@ -175,6 +237,16 @@ ; SKIP-CACHE-INV-LABEL: singlethread_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("singlethread-one-as") acq_rel ret void @@ -200,6 +272,16 @@ ; SKIP-CACHE-INV-LABEL: singlethread_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("singlethread-one-as") seq_cst ret void @@ -225,6 +307,16 @@ ; SKIP-CACHE-INV-LABEL: wavefront_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: wavefront_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: wavefront_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("wavefront") acquire ret void @@ -250,6 +342,16 @@ ; SKIP-CACHE-INV-LABEL: wavefront_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: wavefront_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: wavefront_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("wavefront") release ret void @@ -275,6 +377,16 @@ ; SKIP-CACHE-INV-LABEL: wavefront_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: wavefront_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: wavefront_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("wavefront") acq_rel ret void @@ -300,6 +412,16 @@ ; SKIP-CACHE-INV-LABEL: wavefront_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: wavefront_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: wavefront_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("wavefront") seq_cst ret void @@ -325,6 +447,16 @@ ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("wavefront-one-as") acquire ret void @@ -350,6 +482,16 @@ ; SKIP-CACHE-INV-LABEL: wavefront_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: wavefront_one_as_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("wavefront-one-as") release ret void @@ -375,6 +517,16 @@ ; SKIP-CACHE-INV-LABEL: wavefront_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("wavefront-one-as") acq_rel ret void @@ -400,6 +552,16 
@@ ; SKIP-CACHE-INV-LABEL: wavefront_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("wavefront-one-as") seq_cst ret void @@ -432,6 +594,19 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: workgroup_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: workgroup_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("workgroup") acquire ret void @@ -463,6 +638,18 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: workgroup_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: workgroup_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("workgroup") release ret void @@ -495,6 +682,19 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: workgroup_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: workgroup_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; 
%entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("workgroup") acq_rel ret void @@ -527,6 +727,19 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: workgroup_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: workgroup_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("workgroup") seq_cst ret void @@ -555,6 +768,18 @@ ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acquire_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("workgroup-one-as") acquire ret void @@ -582,6 +807,17 @@ ; SKIP-CACHE-INV-LABEL: workgroup_one_as_release_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: workgroup_one_as_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("workgroup-one-as") release ret void @@ -610,6 +846,18 @@ ; SKIP-CACHE-INV-LABEL: workgroup_one_as_acq_rel_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: workgroup_one_as_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -638,6 +886,18 @@ ; SKIP-CACHE-INV-LABEL: workgroup_one_as_seq_cst_fence: ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: workgroup_one_as_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("workgroup-one-as") seq_cst ret void @@ -676,6 +936,20 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: agent_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: agent_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("agent") acquire ret void @@ -708,6 +982,18 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: agent_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: agent_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("agent") release ret void @@ -746,6 +1032,20 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: agent_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: agent_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("agent") acq_rel ret void @@ -784,6 +1084,20 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: agent_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: agent_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("agent") seq_cst ret void @@ -822,6 +1136,20 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: agent_one_as_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("agent-one-as") acquire ret void @@ -854,6 +1182,18 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: agent_one_as_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("agent-one-as") release ret void @@ -892,6 +1232,20 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: agent_one_as_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("agent-one-as") acq_rel ret void @@ -930,6 +1284,20 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: agent_one_as_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: agent_one_as_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("agent-one-as") seq_cst ret void @@ -968,6 +1336,26 @@ ; SKIP-CACHE-INV: ; 
%bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: system_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: system_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence acquire ret void @@ -1000,6 +1388,20 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: system_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: system_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence release ret void @@ -1038,6 +1440,26 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: system_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: system_acq_rel_fence: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence acq_rel ret void @@ -1076,6 +1498,26 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: system_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: system_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence seq_cst ret void @@ -1114,6 +1556,26 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acquire_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: system_one_as_acquire_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; entry: fence syncscope("one-as") acquire ret void @@ -1146,6 +1608,20 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: system_one_as_release_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: system_one_as_release_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("one-as") release ret void @@ -1184,6 +1660,26 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: system_one_as_acq_rel_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: system_one_as_acq_rel_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("one-as") acq_rel ret void @@ -1222,6 +1718,26 @@ ; SKIP-CACHE-INV: ; %bb.0: ; %entry ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: system_one_as_seq_cst_fence: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: system_one_as_seq_cst_fence: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; entry: fence syncscope("one-as") seq_cst ret void Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: @@ -56,6 +58,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] 
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 @@ -115,6 +143,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] 
glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 @@ -182,6 +236,35 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4 @@ -255,6 +338,37 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 @@ -308,6 +422,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 
+; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 @@ -360,6 +494,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4 @@ -418,6 +572,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent") release, align 4 @@ -476,6 +652,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 @@ -528,6 +726,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") monotonic @@ -591,6 +809,30 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire @@ -649,6 +891,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, 
s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release @@ -718,6 +982,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel @@ -787,6 +1077,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst @@ -855,6 +1171,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire @@ -930,6 +1273,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel @@ -1005,6 +1377,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst @@ -1070,6 +1471,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1146,6 +1567,30 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1217,6 +1662,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
flat_agent_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1299,6 +1766,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1381,6 +1874,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1457,6 +1976,30 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; 
+; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1539,6 +2082,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1621,6 +2190,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 
0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1703,6 +2298,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1785,6 +2406,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1874,6 +2521,33 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1971,6 +2645,35 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2068,6 +2771,35 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2159,6 +2891,33 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2256,6 +3015,35 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2353,6 +3141,35 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2450,6 +3267,35 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2547,6 +3393,35 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2608,6 +3483,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4 @@ -2667,6 +3568,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4 @@ -2735,6 +3662,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4 @@ -2809,6 +3765,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4 @@ -2862,6 +3849,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store: 
+; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4 @@ -2914,6 +3921,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 
+; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4 @@ -2972,6 +3999,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4 @@ -3030,6 +4079,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], 
s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4 @@ -3082,6 +4153,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic @@ -3143,6 
+4234,30 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire @@ -3201,6 +4316,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release @@ -3268,6 +4405,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -3335,6 +4498,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -3403,6 +4592,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire @@ -3478,6 +4694,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -3553,6 +4798,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -3618,6 +4892,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3692,6 +4986,30 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3763,6 +5081,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3843,6 +5183,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3923,6 +5289,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3997,6 +5389,30 @@ ; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4077,6 +5493,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4157,6 +5599,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4237,6 +5705,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4317,6 +5811,32 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4407,6 +5927,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4505,6 +6052,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4603,6 +6179,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4695,6 +6300,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4793,6 +6425,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4891,6 +6552,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4989,6 +6679,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5087,6 +6806,35 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -56,6 +58,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc slc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc slc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { 
entry: %val = load i32, i32* %in, align 4, !nontemporal !0 @@ -121,6 +151,38 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3] glc slc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[2:3] glc slc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -182,6 +244,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 glc slc ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 glc slc +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load i32, i32* %in, align 4 @@ -247,6 +337,38 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 glc slc ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[2:3] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3 +; GFX90A-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX90A-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: @@ -56,6 +58,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 @@ -115,6 +145,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 @@ -174,6 +232,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 @@ -233,6 +319,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 @@ -286,6 +400,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 @@ -338,6 +474,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 
%in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 @@ -390,6 +548,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 @@ -442,6 +622,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 @@ -494,6 +696,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic @@ -546,6 +770,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire @@ -598,6 +844,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: 
flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release @@ -650,6 +918,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel @@ -702,6 +992,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst @@ -762,6 +1074,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = 
atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire @@ -823,6 +1161,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel @@ -884,6 +1248,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst @@ -949,6 +1339,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1014,6 +1426,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1079,6 +1513,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1144,6 +1600,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1209,6 +1687,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1274,6 +1774,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1339,6 +1861,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1404,6 +1948,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1469,6 +2035,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1534,6 +2122,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1615,6 +2225,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1698,6 +2334,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1781,6 +2443,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1864,6 +2552,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], 
s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1947,6 +2661,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2030,6 +2770,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2113,6 +2879,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2196,6 +2988,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2257,6 +3075,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in 
syncscope("singlethread-one-as") unordered, align 4 @@ -2316,6 +3162,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 @@ -2375,6 +3249,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 @@ -2434,6 +3336,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -2487,6 +3417,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 @@ -2539,6 +3491,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4 @@ -2591,6 +3565,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 @@ -2643,6 +3639,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -2695,6 +3713,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -2747,6 +3787,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -2799,6 +3861,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release @@ -2851,6 +3935,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -2903,6 +4009,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -2963,6 +4091,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, 
s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -3024,6 +4178,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -3085,6 +4265,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw 
volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -3150,6 +4356,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3215,6 +4443,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3280,6 +4530,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3345,6 +4617,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3410,6 +4704,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 
+; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3475,6 +4791,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3540,6 +4878,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3605,6 +4965,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3670,6 +5052,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, i32* %out, i32 4 @@ -3735,6 +5139,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3816,6 +5242,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3899,6 +5351,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3982,6 +5460,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4065,6 +5569,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4148,6 +5678,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4231,6 +5787,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword 
v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4314,6 +5896,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4397,6 +6005,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: @@ -56,6 +58,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in unordered, align 4 @@ -115,6 +143,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in monotonic, align 4 @@ -182,6 +236,38 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_system_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in acquire, align 4 @@ -255,6 +341,40 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in seq_cst, align 4 @@ -308,6 +428,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out unordered, align 4 @@ -360,6 +500,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out monotonic, align 4 @@ -418,6 +578,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out release, align 4 @@ -476,6 +660,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out seq_cst, align 4 @@ -528,6 +736,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in monotonic @@ -591,6 +819,34 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in acquire @@ -649,6 +905,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in release @@ -718,6 +998,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel @@ -787,6 +1099,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst @@ -855,6 +1199,36 @@ ; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in acquire @@ -930,6 +1304,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword 
s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel @@ -1005,6 +1413,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst @@ -1070,6 +1512,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1146,6 +1608,34 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1217,6 +1707,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1299,6 +1813,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1381,6 +1927,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1457,6 +2035,34 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1539,6 +2145,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1621,6 +2259,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1703,6 +2373,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
flat_system_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1785,6 +2487,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1874,6 +2608,36 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1971,6 +2735,40 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2068,6 +2866,40 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2159,6 +2991,36 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2256,6 +3118,40 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2353,6 +3249,40 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; 
GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2450,6 +3380,40 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2547,6 +3511,40 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2608,6 +3606,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4 @@ -2667,6 +3691,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4 @@ -2735,6 +3785,39 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4 @@ -2809,6 +3892,41 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4 @@ -2862,6 +3980,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4 @@ -2914,6 +4052,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4 @@ -2972,6 +4130,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("one-as") release, align 4 @@ -3030,6 +4212,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4 @@ -3082,6 +4288,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic @@ -3143,6 +4369,34 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire @@ -3201,6 +4455,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release @@ -3268,6 +4546,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel @@ -3335,6 +4645,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst @@ -3403,6 +4745,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire @@ -3478,6 +4851,41 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel @@ -3553,6 +4961,41 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst @@ -3618,6 +5061,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3692,6 +5155,34 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3763,6 +5254,30 @@ ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3843,6 +5358,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3923,6 +5470,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3997,6 +5576,34 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4077,6 +5684,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 
0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4157,6 +5796,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], 
s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4237,6 +5908,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4317,6 +6020,38 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; 
GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4407,6 +6142,37 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4505,6 +6271,41 @@ ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4603,6 +6404,41 @@ ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4695,6 +6531,37 @@ ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4793,6 +6660,41 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4891,6 +6793,41 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4989,6 +6926,41 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -5087,6 +7059,41 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: @@ -56,6 +58,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 @@ -115,6 +145,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 @@ -174,6 +232,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_wavefront_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 @@ -233,6 +319,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 
+; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 @@ -286,6 +400,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 @@ -338,6 +474,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 @@ -390,6 +548,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 @@ -442,6 +622,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 @@ -494,6 +696,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic @@ -546,6 +770,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire @@ -598,6 +844,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: 
flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release @@ -650,6 +918,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel @@ -702,6 +992,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst @@ -762,6 +1074,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire @@ -823,6 +1161,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel @@ -884,6 +1248,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst @@ -949,6 +1339,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1014,6 +1426,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1079,6 +1513,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1144,6 +1600,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm 
+; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1209,6 +1687,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1274,6 +1774,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1339,6 +1861,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1404,6 +1948,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1469,6 +2035,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1534,6 +2122,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1615,6 +2225,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1698,6 +2334,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; 
i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1781,6 +2443,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1864,6 +2552,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1947,6 +2661,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 
+; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2030,6 +2770,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2113,6 +2879,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2196,6 +2988,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2257,6 +3075,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 @@ -2316,6 +3162,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 @@ -2375,6 +3249,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 @@ -2434,6 +3336,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -2487,6 +3417,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 @@ -2539,6 +3491,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 @@ -2591,6 +3565,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 @@ -2643,6 +3639,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32* %out) { entry: 
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -2695,6 +3713,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -2747,6 +3787,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -2799,6 +3861,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release @@ -2851,6 +3935,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -2903,6 +4009,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -2963,6 +4091,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -3024,6 +4178,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -3085,6 +4265,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -3150,6 +4356,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 
%old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3215,6 +4443,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3280,6 +4530,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3345,6 +4617,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: 
%gep = getelementptr i32, i32* %out, i32 4 @@ -3410,6 +4704,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3475,6 +4791,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3540,6 +4878,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* 
%out, i32 4 @@ -3605,6 +4965,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3670,6 +5052,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3735,6 +5139,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3816,6 +5242,32 @@ ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3899,6 +5351,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3982,6 +5460,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4065,6 +5569,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4148,6 +5678,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4231,6 +5787,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4314,6 +5896,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4397,6 +6005,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: @@ -56,6 +58,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4 @@ -115,6 +143,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4 @@ -179,6 +233,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4 @@ -248,6 +330,36 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4 @@ -301,6 +413,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4 @@ -353,6 +485,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4 @@ -410,6 +562,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; 
SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4 @@ -467,6 +641,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4 @@ -519,6 +715,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") monotonic @@ -577,6 +793,29 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire @@ -634,6 +873,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") release @@ -697,6 +958,31 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel @@ -760,6 +1046,31 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst @@ -822,6 +1133,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire @@ -890,6 +1226,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel @@ -958,6 +1321,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst @@ -1023,6 
+1413,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1094,6 +1504,29 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1164,6 +1597,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap 
v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1240,6 +1695,31 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1316,6 +1796,31 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1387,6 +1892,29 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
flat_workgroup_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1463,6 +1991,31 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1539,6 +2092,31 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1615,6 +2193,31 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1691,6 +2294,31 @@ ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1777,6 +2405,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1870,6 +2523,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, 
i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -1963,6 +2643,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2051,6 +2758,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2144,6 +2876,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2237,6 +2996,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2330,6 +3116,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -2423,6 +3236,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ 
-2484,6 +3324,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4 @@ -2543,6 +3409,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4 @@ -2604,6 +3496,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4 @@ -2667,6 +3586,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[2:3], v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_load_dword v0, v[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, s3 +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[2:3], v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %in, i32* %out) { entry: %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -2720,6 +3667,26 @@ ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4 @@ -2772,6 +3739,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4 @@ -2826,6 +3813,27 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4 @@ -2880,6 +3888,27 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32* %out) { entry: store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -2932,6 +3961,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: 
flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -2986,6 +4035,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -3040,6 +4111,27 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release @@ -3096,6 +4188,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm 
i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -3152,6 +4267,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -3214,6 +4352,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -3279,6 +4442,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -3344,6 +4533,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, 
i32 %in) { entry: %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -3409,6 +4624,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3476,6 +4711,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], 
s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3543,6 +4800,27 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3612,6 +4890,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3681,6 +4982,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3748,6 +5072,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3817,6 +5163,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3886,6 +5255,29 @@ ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -3955,6 +5347,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], 
s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4024,6 +5439,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4107,6 +5545,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4194,6 +5657,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4281,6 +5770,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4366,6 +5881,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4453,6 +5993,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4540,6 +6106,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 
%old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4627,6 +6219,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 @@ -4714,6 +6332,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck 
--check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: @@ -72,6 +74,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") unordered, align 4 @@ -146,6 +168,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") monotonic, align 4 @@ -226,6 +268,28 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = 
load atomic i32, i32 addrspace(1)* %in syncscope("agent") acquire, align 4 @@ -311,6 +375,28 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") seq_cst, align 4 @@ -373,6 +459,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: global_agent_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") unordered, align 4 @@ -434,6 +540,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") monotonic, align 4 @@ -502,6 +628,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") release, align 4 @@ -570,6 +718,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") seq_cst, align 4 @@ -631,6 +801,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") monotonic @@ -703,6 +893,30 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire @@ -771,6 +985,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, 
s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") release @@ -850,6 +1086,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel @@ -929,6 +1191,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst @@ -1006,6 +1294,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: 
; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire @@ -1091,6 +1405,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel @@ -1176,6 +1518,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst @@ -1245,6 +1615,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], 
off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1325,6 +1715,30 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_agent_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1401,6 +1815,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 
%in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1488,6 +1924,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1575,6 +2037,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1655,6 +2143,30 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1742,6 +2254,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1829,6 +2367,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1916,6 +2480,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 
offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2003,6 +2593,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2090,6 +2706,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2186,6 +2828,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2282,6 +2952,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: 
s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2371,6 +3069,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2467,6 +3191,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2563,6 +3315,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2659,6 +3439,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, 
s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2755,6 +3563,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2831,6 +3667,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4 @@ -2905,6 +3761,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_load: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4 @@ -2985,6 +3861,28 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4 @@ -3070,6 +3968,28 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4 @@ -3132,6 +4052,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4 @@ -3193,6 +4133,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") 
monotonic, align 4 @@ -3261,6 +4221,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4 @@ -3329,6 +4311,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_agent_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4 @@ -3390,6 +4394,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic @@ -3462,6 +4486,30 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_agent_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire @@ -3530,6 +4578,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release @@ -3609,6 +4679,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -3688,6 +4784,32 @@ ; 
SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -3765,6 +4887,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire @@ -3850,6 +4998,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -3935,6 +5111,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; 
GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -4004,6 +5208,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4084,6 +5308,30 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4160,6 +5408,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4247,6 +5517,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, i32 addrspace(1)* %out, i32 4 @@ -4334,6 +5630,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4414,6 +5736,30 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4501,6 +5847,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_agent_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4588,6 +5960,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4675,6 +6073,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4762,6 +6186,32 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4849,6 +6299,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4945,6 +6421,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5041,6 +6545,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5130,6 +6662,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 
addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5226,6 +6784,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5322,6 +6908,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5418,6 +7032,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5514,6 +7156,34 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @global_nontemporal_load_0( ; GFX6-LABEL: global_nontemporal_load_0: @@ -67,6 
+69,30 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_0: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_0: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0 @@ -145,6 +171,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_load_1: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v0, v0, s[0:1] glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_nontemporal_load_1: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v0, v0, s[0:1] glc slc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -216,6 +266,30 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 glc slc ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_0: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_0: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4 @@ -289,6 +363,30 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: global_nontemporal_store_1: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_nontemporal_store_1: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: @@ -72,6 +74,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") unordered, align 4 @@ -146,6 +170,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") monotonic, align 4 @@ -220,6 +266,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") acquire, align 4 @@ -294,6 +362,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") seq_cst, align 4 @@ -356,6 +446,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") unordered, align 4 @@ -417,6 +529,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") monotonic, align 4 @@ -478,6 +612,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") release, align 4 @@ -539,6 +695,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") seq_cst, align 4 @@ -600,6 +778,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, 
off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") monotonic @@ -661,6 +861,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire @@ -722,6 +944,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") release @@ -783,6 +1027,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel @@ -844,6 +1110,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst @@ -915,6 +1203,32 @@ ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire @@ -987,6 +1301,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap 
v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel @@ -1059,6 +1399,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: 
global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst @@ -1128,6 +1494,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1197,6 +1585,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1266,6 +1676,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1335,6 +1767,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1404,6 +1858,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1473,6 +1949,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1542,6 +2040,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1611,6 +2131,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1680,6 +2222,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1749,6 +2313,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1830,6 +2416,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1913,6 +2525,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1996,6 +2634,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2079,6 +2743,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2162,6 +2852,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2245,6 +2961,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2328,6 +3070,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2411,6 +3179,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2487,6 +3281,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_singlethread_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") unordered, align 4 @@ -2561,6 +3377,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") monotonic, align 4 @@ -2635,6 +3473,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") acquire, align 4 @@ -2709,6 +3569,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, 
v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -2771,6 +3653,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") unordered, align 4 @@ -2832,6 
+3736,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") monotonic, align 4 @@ -2893,6 +3819,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 
0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") release, align 4 @@ -2954,6 +3902,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -3015,6 +3985,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -3076,6 +4068,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 
addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -3137,6 +4151,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") release @@ -3198,6 +4234,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -3259,6 +4317,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -3330,6 +4410,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -3402,6 +4508,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -3474,6 +4606,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -3543,6 +4701,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3612,6 +4792,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3681,6 +4883,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3750,6 +4974,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3819,6 +5065,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3888,6 +5156,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3957,6 +5247,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4026,6 +5338,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4095,6 +5429,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4164,6 +5520,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4245,6 +5623,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4328,6 +5732,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4411,6 +5841,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4494,6 +5950,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4577,6 +6059,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4660,6 +6168,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4743,6 +6277,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4826,6 +6386,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr 
i32, i32 addrspace(1)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: @@ -72,6 +74,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, 
v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in unordered, align 4 @@ -146,6 +168,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in monotonic, align 4 @@ -226,6 +268,32 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in acquire, align 4 @@ -311,6 +379,32 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4 @@ -373,6 +467,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out unordered, align 4 @@ -434,6 +548,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out monotonic, align 4 @@ -502,6 +636,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: 
store atomic i32 %in, i32 addrspace(1)* %out release, align 4 @@ -570,6 +728,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 @@ -631,6 +813,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, 
s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in monotonic @@ -703,6 +905,34 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: 
s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire @@ -771,6 +1001,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in release @@ -850,6 +1104,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel @@ -929,6 +1215,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst @@ -1006,6 +1324,36 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 
+; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire @@ -1091,6 +1439,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel @@ -1176,6 +1558,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst @@ -1245,6 +1661,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1325,6 +1761,34 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1401,6 +1865,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, 
v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1488,6 +1976,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 
v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1575,6 +2095,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1655,6 +2207,34 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, 
i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1742,6 +2322,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1829,6 +2441,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, 
s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1916,6 +2560,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2003,6 +2679,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2090,6 +2798,36 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2186,6 +2924,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2282,6 +3054,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2371,6 +3177,36 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2467,6 +3303,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_system_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2563,6 +3433,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2659,6 +3563,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2755,6 +3693,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2831,6 +3803,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 
addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4 @@ -2905,6 +3897,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4 @@ -2985,6 +3997,32 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4 @@ -3070,6 +4108,32 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4 @@ -3132,6 +4196,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4 @@ -3193,6 +4277,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4 @@ -3261,6 +4365,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store 
atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4 @@ -3329,6 +4457,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4 @@ -3390,6 +4542,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic @@ -3462,6 +4634,34 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire @@ -3530,6 +4730,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release @@ -3609,6 +4833,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel @@ -3688,6 +4944,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, 
s[0:1] scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst @@ -3765,6 +5053,36 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_system_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire @@ -3850,6 +5168,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel @@ -3935,6 +5287,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: 
buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst @@ -4004,6 +5390,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4084,6 +5490,34 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4160,6 +5594,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4247,6 +5705,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4334,6 +5824,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4414,6 +5936,34 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4501,6 +6051,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4588,6 +6170,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 
addrspace(1)* %out, i32 4 @@ -4675,6 +6289,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4762,6 +6408,38 @@ ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4849,6 +6527,36 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4945,6 +6653,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5041,6 +6783,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5130,6 +6906,36 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5226,6 +7032,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5322,6 +7162,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5418,6 +7292,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -5514,6 +7422,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol +; 
GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: buffer_wbl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc scc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_invl2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define 
amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: @@ -72,6 +74,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") unordered, align 4 @@ -146,6 +170,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_load: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") monotonic, align 4 @@ -220,6 +266,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") acquire, align 4 @@ -294,6 +362,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") seq_cst, align 4 @@ -356,6 +446,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; 
+; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") unordered, align 4 @@ -417,6 +529,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") monotonic, align 4 @@ -478,6 +612,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_wavefront_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") release, align 4 @@ -539,6 +695,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") seq_cst, align 4 @@ -600,6 +778,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") monotonic @@ -661,6 +861,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm 
+; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire @@ -722,6 +944,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") release @@ -783,6 +1027,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_wavefront_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel @@ -844,6 +1110,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst @@ -915,6 +1203,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire @@ -987,6 +1301,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_wavefront_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel @@ -1059,6 +1399,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { 
entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst @@ -1128,6 +1494,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1197,6 +1585,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] 
op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1266,6 +1676,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep 
= getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1335,6 +1767,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1404,6 +1858,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap 
v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1473,6 +1949,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1542,6 
+2040,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1611,6 +2131,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; 
+; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1680,6 +2222,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1749,6 +2313,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; 
SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1830,6 +2416,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, 
s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1913,6 +2525,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1996,6 +2634,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2079,6 +2743,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2162,6 +2852,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2245,6 +2961,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2328,6 +3070,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2411,6 +3179,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2487,6 +3281,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") unordered, align 4 @@ -2561,6 +3377,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") monotonic, align 4 @@ -2635,6 +3473,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") acquire, align 4 @@ -2709,6 +3569,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -2771,6 +3653,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") unordered, align 4 @@ -2832,6 +3736,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") monotonic, align 4 @@ -2893,6 +3819,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out 
syncscope("wavefront-one-as") release, align 4 @@ -2954,6 +3902,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -3015,6 +3985,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry 
+; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -3076,6 +4068,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -3137,6 +4151,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") release @@ -3198,6 +4234,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -3259,6 +4317,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -3330,6 +4410,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, 
s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -3402,6 +4508,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: 
global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -3474,6 +4606,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -3543,6 +4701,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3612,6 +4792,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3681,6 +4883,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3750,6 +4974,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3819,6 +5065,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3888,6 +5156,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3957,6 +5247,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4026,6 +5338,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4095,6 +5429,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4164,6 +5520,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4245,6 +5623,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4328,6 +5732,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4411,6 +5841,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4494,6 +5950,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4577,6 +6059,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4660,6 +6168,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4743,6 +6277,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4826,6 +6386,32 @@ ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: @@ -72,6 +74,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") unordered, align 4 @@ -146,6 +168,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_workgroup_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") monotonic, align 4 @@ -221,6 +263,27 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4 @@ -300,6 +363,27 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") seq_cst, align 4 @@ -362,6 +446,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") unordered, align 4 @@ -423,6 +527,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") monotonic, align 4 @@ -490,6 +614,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4 @@ -557,6 +703,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, 
s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") seq_cst, align 4 @@ -618,6 +786,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") monotonic @@ -681,6 +869,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire @@ -748,6 +958,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") release @@ -817,6 +1049,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel @@ -886,6 +1142,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst @@ -958,6 +1238,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword 
s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire @@ -1037,6 +1342,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) 
{ entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel @@ -1116,6 +1448,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst @@ -1185,6 +1544,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1256,6 +1635,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1331,6 +1732,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1408,6 +1831,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1485,6 +1932,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, 
v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1556,6 +2027,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1633,6 +2126,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1710,6 +2227,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1787,6 +2328,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_workgroup_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1864,6 +2429,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -1946,6 +2535,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2036,6 +2650,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2126,6 +2767,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2210,6 +2878,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2300,6 +2993,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2390,6 +3110,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { 
entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2480,6 +3227,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2570,6 +3344,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -2646,6 +3447,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4 @@ -2720,6 +3541,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4 @@ -2795,6 +3636,27 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4 @@ -2871,6 +3733,27 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -2933,6 +3816,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4 @@ -2994,6 +3897,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4 @@ -3057,6 +3980,27 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store 
atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4 @@ -3120,6 +4064,27 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -3181,6 +4146,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -3244,6 +4229,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -3307,6 +4314,27 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release @@ -3372,6 +4400,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -3437,6 +4488,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -3509,6 +4583,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: 
s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -3584,6 +4683,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -3659,6 +4784,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -3728,6 +4879,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3799,6 +4970,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3870,6 +5063,27 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -3943,6 +5157,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 
addrspace(1)* %out, i32 4 @@ -4016,6 +5253,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4087,6 +5347,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4160,6 +5442,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4233,6 +5538,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4306,6 +5634,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4379,6 +5730,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4461,6 +5835,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 
glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4547,6 +5946,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4633,6 +6058,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4717,6 +6168,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4803,6 +6279,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4889,6 +6391,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 @@ -4975,6 +6503,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 
addrspace(1)* %out, i32 4 @@ -5061,6 +6615,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir @@ -11,10 +11,10 @@ $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def 
$vgpr0_vgpr1, implicit $sgpr2_sgpr3 $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec - renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(42)* undef`) + renamable $vgpr2 = FLAT_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(42)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -30,7 +30,7 @@ $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(42)* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(42)* undef`) S_ENDPGM 0 ... 
@@ -47,7 +47,7 @@ $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup-one-as") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`) + FLAT_ATOMIC_CMPSWAP killed renamable $vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("workgroup-one-as") seq_cst seq_cst 4 on `i32 addrspace(42)* undef`) S_ENDPGM 0 ... @@ -63,7 +63,7 @@ $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $sgpr2_sgpr3, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront-one-as") seq_cst 4 on `i32 addrspace(42)* undef`) + FLAT_ATOMIC_SWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store syncscope("wavefront-one-as") seq_cst 4 on `i32 addrspace(42)* undef`) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @local_agent_unordered_load( ; GFX6-LABEL: local_agent_unordered_load: @@ -64,6 +66,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") unordered, align 4 @@ -130,6 +154,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") monotonic, align 4 @@ -197,6 +243,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") acquire, align 4 @@ -270,6 +339,31 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 
entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") seq_cst, align 4 @@ -326,6 +420,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") unordered, align 4 @@ -381,6 +493,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: 
s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") monotonic, align 4 @@ -442,6 +572,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") release, align 4 @@ -503,6 +653,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") seq_cst, align 4 @@ -558,6 +728,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") monotonic @@ -619,6 +807,26 @@ ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire @@ -680,6 +888,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") release @@ -747,6 +975,28 @@ ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel @@ -814,6 +1064,28 @@ ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst @@ -880,6 +1152,29 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire @@ -953,6 +1248,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel @@ -1026,6 +1346,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst @@ -1088,6 +1433,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1156,6 +1521,28 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1224,6 +1611,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* 
%out, i32 4 @@ -1298,6 +1707,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1372,6 +1805,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1440,6 +1897,28 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1514,6 +1993,30 @@ ; 
SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1588,6 +2091,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1662,6 +2189,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 
addrspace(3)* %out, i32 4 @@ -1736,6 +2287,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1809,6 +2384,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1890,6 +2490,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1971,6 +2598,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2046,6 +2700,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2127,6 +2806,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_release_acquire_ret_cmpxchg: 
+; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2208,6 +2914,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2289,6 +3022,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2370,6 +3130,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2438,6 +3225,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") unordered, align 4 @@ -2504,6 +3313,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") monotonic, align 4 @@ -2570,6 +3401,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") acquire, align 4 @@ -2636,6 +3489,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 
addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") seq_cst, align 4 @@ -2692,6 +3567,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") unordered, align 4 @@ -2747,6 +3640,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") monotonic, align 4 @@ -2802,6 +3713,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") release, align 4 @@ -2857,6 +3786,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") seq_cst, align 4 @@ -2912,6 +3859,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") monotonic @@ -2967,6 +3932,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_agent_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire @@ -3022,6 +4005,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") release @@ -3077,6 +4078,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -3132,6 +4151,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -3197,6 +4234,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire @@ -3263,6 +4322,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, 
v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel @@ -3329,6 +4410,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst @@ -3391,6 +4494,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3453,6 +4576,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3515,6 +4658,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, 
v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3577,6 +4740,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3639,6 +4822,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3701,6 +4904,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 
+; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3763,6 +4986,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3825,6 +5068,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3887,6 +5150,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 
addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3949,6 +5232,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4021,6 +5324,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4095,6 +5422,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* 
%out, i32 4 @@ -4169,6 +5520,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4243,6 +5618,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4317,6 +5716,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm 
i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4391,6 +5814,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4465,6 +5912,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4539,6 +6010,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6-LABEL: local_nontemporal_load_0: @@ -73,6 +75,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; 
+; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0 @@ -151,6 +179,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -225,6 
+279,32 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(3)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4 @@ -300,6 +380,32 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; 
+; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(3)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX6-LABEL: local_singlethread_unordered_load: @@ -64,6 +66,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") unordered, align 4 @@ -130,6 +156,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") monotonic, align 4 @@ -196,6 +246,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") acquire, align 4 @@ -262,6 +336,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") seq_cst, align 4 @@ -318,6 +416,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") unordered, align 4 @@ -373,6 +491,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") monotonic, align 4 @@ -428,6 +566,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") release, align 4 @@ -483,6 +641,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") seq_cst, align 4 @@ -538,6 +716,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") monotonic @@ -593,6 
+791,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire @@ -648,6 +866,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") release @@ -703,6 +941,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel @@ -758,6 +1016,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst @@ -823,6 +1101,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire @@ -889,6 +1191,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel @@ -955,6 +1281,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst @@ -1017,6 +1367,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1079,6 +1451,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1141,6 +1535,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1203,6 +1619,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1265,6 +1703,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 
addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1327,6 +1787,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1389,6 +1871,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1451,6 +1955,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1513,6 +2039,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1575,6 +2123,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ 
-1647,6 +2217,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1721,6 +2317,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1795,6 +2417,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: 
s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1869,6 +2517,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1943,6 +2617,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2017,6 +2717,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2091,6 +2817,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2165,6 +2917,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2233,6 +3011,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4 @@ -2299,6 +3101,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4 @@ -2365,6 +3191,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4 @@ -2431,6 +3281,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in 
syncscope("singlethread-one-as") seq_cst, align 4 @@ -2487,6 +3361,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4 @@ -2542,6 +3436,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: 
ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4 @@ -2597,6 +3511,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4 @@ -2652,6 +3586,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -2707,6 +3661,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -2762,6 +3736,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: 
s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -2817,6 +3811,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release @@ -2872,6 +3886,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -2927,6 +3961,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -2992,6 +4046,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire @@ -3058,6 +4136,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -3124,6 +4226,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -3186,6 +4312,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3248,6 +4396,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 
+; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3310,6 +4480,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3372,6 +4564,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3434,6 +4648,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3496,6 +4732,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3558,6 +4816,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3620,6 +4900,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3682,6 +4984,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3744,6 +5068,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3816,6 +5162,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3890,6 +5262,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3964,6 +5362,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 
addrspace(3)* %out, i32 4 @@ -4038,6 +5462,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4112,6 +5562,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 
+; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4186,6 +5662,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4260,6 +5762,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4334,6 +5862,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @local_system_unordered_load( ; GFX6-LABEL: 
local_system_unordered_load: @@ -64,6 +66,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in unordered, align 4 @@ -130,6 +154,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
+; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in monotonic, align 4 @@ -197,6 +243,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in acquire, align 4 @@ -270,6 +339,31 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in seq_cst, align 4 @@ -326,6 +420,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out unordered, align 
4 @@ -381,6 +493,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out monotonic, align 4 @@ -442,6 +572,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm 
i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out release, align 4 @@ -503,6 +653,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out seq_cst, align 4 @@ -558,6 +728,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in monotonic @@ -619,6 +807,26 @@ ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire @@ -680,6 +888,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: local_system_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in release @@ -747,6 +975,28 @@ ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel @@ -814,6 +1064,28 @@ ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_atomicrmw: +; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst @@ -880,6 +1152,29 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, 
v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire @@ -953,6 +1248,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel @@ -1026,6 +1346,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst @@ -1088,6 +1433,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 
+; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1156,6 +1521,28 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1224,6 +1611,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1298,6 +1707,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 
%old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1372,6 +1805,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1440,6 +1897,28 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1514,6 +1993,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, 
i32 addrspace(3)* %out, i32 4 @@ -1588,6 +2091,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1662,6 +2189,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 
offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1736,6 +2287,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, 
i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1809,6 +2384,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1890,6 +2490,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1971,6 +2598,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2046,6 +2700,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2127,6 +2806,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2208,6 +2914,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2289,6 +3022,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2370,6 +3130,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2438,6 +3225,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4 @@ -2504,6 +3313,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") monotonic, align 4 @@ -2570,6 +3401,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4 @@ -2636,6 +3489,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4 @@ -2692,6 +3567,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4 @@ -2747,6 +3640,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm 
+; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4 @@ -2802,6 +3713,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4 @@ -2857,6 +3786,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; 
SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4 @@ -2912,6 +3859,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 
addrspace(3)* %out, i32 %in syncscope("one-as") monotonic @@ -2967,6 +3932,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire @@ -3022,6 +4005,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release @@ -3077,6 +4078,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel @@ -3132,6 +4151,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst @@ -3197,6 +4234,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire @@ -3263,6 +4322,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel @@ -3329,6 +4410,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst @@ -3391,6 +4494,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3453,6 +4576,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_system_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3515,6 +4658,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3577,6 +4740,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3639,6 +4822,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3701,6 +4904,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3763,6 +4986,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3825,6 +5068,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3887,6 +5150,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3949,6 +5232,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4021,6 +5324,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: 
ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4095,6 +5422,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; 
+; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4169,6 +5520,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 
addrspace(3)* %out, i32 4 @@ -4243,6 +5618,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4317,6 +5716,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4391,6 +5814,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, 
v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4465,6 +5912,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4539,6 +6010,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX6-LABEL: local_wavefront_unordered_load: @@ -64,6 +66,30 @@ ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") unordered, align 4 @@ -130,6 +156,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") monotonic, align 4 @@ -196,6 +246,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") acquire, align 4 @@ -262,6 +336,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") seq_cst, align 4 @@ -318,6 +416,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") unordered, align 4 @@ -373,6 +491,26 @@ ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") monotonic, align 4 @@ -428,6 +566,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* 
%out syncscope("wavefront") release, align 4 @@ -483,6 +641,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") seq_cst, align 4 @@ -538,6 +716,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; 
GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") monotonic @@ -593,6 +791,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire @@ -648,6 +866,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") release @@ -703,6 +941,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel @@ -758,6 +1016,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_wavefront_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst @@ -823,6 +1101,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire @@ -889,6 +1191,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; 
%entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel @@ -955,6 +1281,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 
v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst @@ -1017,6 +1367,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1079,6 +1451,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1141,6 +1535,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1203,6 +1619,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; 
GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1265,6 +1703,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1327,6 +1787,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1389,6 +1871,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; 
GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1451,6 +1955,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1513,6 +2039,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1575,6 +2123,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 
%old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1647,6 +2217,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1721,6 +2317,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, 
v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1795,6 +2417,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1869,6 +2517,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1943,6 +2617,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2017,6 +2717,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2091,6 +2817,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2165,6 +2917,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2233,6 +3011,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") unordered, align 4 @@ -2299,6 +3101,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") monotonic, align 4 @@ -2365,6 +3191,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: 
ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") acquire, align 4 @@ -2431,6 +3281,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 
addrspace(3)* %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -2487,6 +3361,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") unordered, align 4 @@ -2542,6 +3436,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: 
ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") monotonic, align 4 @@ -2597,6 +3511,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") release, align 4 @@ -2652,6 +3586,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -2707,6 +3661,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -2762,6 +3736,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_wavefront_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -2817,6 +3811,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") release @@ -2872,6 +3886,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -2927,6 +3961,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -2992,6 +4046,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire @@ -3058,6 +4136,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -3124,6 +4226,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -3186,6 +4312,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3248,6 +4396,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = 
getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3310,6 +4480,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3372,6 +4564,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3434,6 +4648,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3496,6 +4732,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3558,6 +4816,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 
4 @@ -3620,6 +4900,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3682,6 +4984,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3744,6 +5068,28 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3816,6 +5162,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3890,6 +5262,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3964,6 +5362,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4038,6 +5462,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4112,6 +5562,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4186,6 +5662,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4260,6 +5762,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: 
s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4334,6 +5862,32 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX6-LABEL: local_workgroup_unordered_load: @@ -64,6 +66,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") unordered, align 4 @@ -130,6 +154,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: 
s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") monotonic, align 4 @@ -197,6 +243,29 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4 @@ -270,6 +339,31 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") seq_cst, align 4 @@ -326,6 +420,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") unordered, align 4 @@ -381,6 +493,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
local_workgroup_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") monotonic, align 4 @@ -442,6 +572,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4 @@ -503,6 
+653,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") seq_cst, align 4 @@ -558,6 +728,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: 
ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") monotonic @@ -619,6 +807,26 @@ ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire @@ -680,6 +888,26 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_workgroup_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") release @@ -747,6 +975,28 @@ ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel @@ -814,6 +1064,28 @@ ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
local_workgroup_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst @@ -880,6 +1152,29 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire @@ -953,6 +1248,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel @@ -1026,6 +1346,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
local_workgroup_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst @@ -1088,6 +1433,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1156,6 +1521,28 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1224,6 +1611,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1298,6 +1707,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 
+; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1372,6 +1805,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1440,6 +1897,28 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1514,6 +1993,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1588,6 +2091,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1662,6 +2189,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1736,6 +2287,30 @@ ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1809,6 +2384,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1890,6 +2490,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: 
local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -1971,6 +2598,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2046,6 +2700,31 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2127,6 +2806,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2208,6 +2914,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: 
; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2289,6 +3022,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2370,6 +3130,33 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 
offset:16 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -2438,6 +3225,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4 @@ -2504,6 +3313,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4 @@ -2570,6 +3401,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in 
syncscope("workgroup-one-as") acquire, align 4 @@ -2636,6 +3489,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v1, v0 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(3)* %out) { entry: %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -2692,6 +3567,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_unordered_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_unordered_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4 @@ -2747,6 +3640,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4 @@ -2802,6 +3713,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: 
local_workgroup_one_as_release_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4 @@ -2857,6 +3786,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { entry: store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -2912,6 +3859,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 
+; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -2967,6 +3932,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -3022,6 +4005,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release @@ -3077,6 +4078,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -3132,6 +4151,24 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; 
SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -3197,6 +4234,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire @@ -3263,6 +4322,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -3329,6 +4410,28 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 
v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -3391,6 +4494,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep 
= getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3453,6 +4576,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3515,6 +4658,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3577,6 +4740,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3639,6 +4822,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3701,6 +4904,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3763,6 +4986,26 @@ ; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3825,6 +5068,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3887,6 +5150,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -3949,6 +5232,26 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4021,6 +5324,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 
%old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4095,6 +5422,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4169,6 +5520,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: 
ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4243,6 +5618,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4317,6 +5716,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4391,6 +5814,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 
+; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4465,6 +5912,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 @@ -4539,6 +6010,30 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NOTTGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-TGSPLIT-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 +; GFX90A-TGSPLIT-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir @@ -20,7 +20,7 @@ renamable $vgpr2 = DS_READ_B32 
killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -44,7 +44,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -68,7 +68,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -92,7 +92,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -116,7 +116,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -140,7 +140,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -164,7 +164,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -188,7 +188,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -212,7 +212,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -236,7 +236,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -260,7 +260,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -284,7 +284,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -308,7 +308,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -332,7 +332,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -356,7 +356,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -380,7 +380,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -404,7 +404,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -428,7 +428,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -452,7 +452,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -476,7 +476,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir @@ -23,13 +23,13 @@ $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) S_WAITCNT 127 S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 2, implicit $exec $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) S_CBRANCH_SCC0 %bb.1, implicit killed $scc bb.2: @@ -55,11 +55,11 @@ S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 - FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`) + FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`) S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir @@ -117,13 +117,13 @@ $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) S_WAITCNT 127 S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 2, implicit $exec $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) 
S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc bb.2.else: @@ -149,11 +149,11 @@ S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 - FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out) + FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir @@ -42,7 +42,7 @@ # CHECK-LABEL: name: multiple_mem_operands # CHECK-LABEL: bb.3.done: -# CHECK: BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0 +# CHECK: BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0 name: multiple_mem_operands alignment: 1 @@ -97,13 +97,13 @@ $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) S_WAITCNT 127 S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 2, implicit $exec $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc bb.2.else: @@ -129,11 +129,11 @@ S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, 
$sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 - FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out) + FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out) S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s define amdgpu_kernel void @private_nontemporal_load_0( ; GFX6-LABEL: private_nontemporal_load_0: @@ -91,6 +93,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 
SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen glc slc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0 @@ -187,6 +223,40 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; 
GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -280,6 +350,40 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: %val = load i32, i32 addrspace(1)* %in, align 4 @@ -374,6 +478,40 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen glc slc ; SKIP-CACHE-INV-NEXT: s_endpgm +; +; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1: +; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 
+; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-NOTTGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc +; GFX90A-NOTTGSPLIT-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1: +; GFX90A-TGSPLIT: ; %bb.0: ; %entry +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 +; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir @@ -20,7 +20,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, 
implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -44,7 +44,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -68,7 +68,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -92,7 +92,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -116,7 +116,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -140,7 +140,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -164,7 +164,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -188,7 +188,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -212,7 +212,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -236,7 +236,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -260,7 +260,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -284,7 +284,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -308,7 +308,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -332,7 +332,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -356,7 +356,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -380,7 +380,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -404,7 +404,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -428,7 +428,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... @@ -452,7 +452,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... 
@@ -476,7 +476,7 @@ renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`) $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/memory_clause.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/memory_clause.mir +++ llvm/test/CodeGen/AMDGPU/memory_clause.mir @@ -2,12 +2,12 @@ # GCN-LABEL: {{^}}name: vector_clause{{$}} # GCN: early-clobber %2:vreg_128, early-clobber %4:vreg_128, early-clobber %1:vreg_128, early-clobber %3:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } -# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, 0, implicit $exec --- name: vector_clause @@ -21,24 +21,24 @@ body: | bb.0: %0 = IMPLICIT_DEF - 
%1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec - %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec - %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec - %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %2, 16, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %3, 32, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %4, 48, 0, 0, 0, implicit $exec + %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2, 16, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %3, 32, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %4, 48, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: subreg_full{{$}} # GCN: early-clobber %1:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: internal %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } -# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, 0, implicit $exec --- name: subreg_full @@ -49,20 +49,20 @@ body: | bb.0: %0 = IMPLICIT_DEF - undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec - %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec - %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec - %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec + undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, 0, implicit $exec + %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec + %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: subreg_part{{$}} # GCN: undef early-clobber %1.sub0_sub1:vreg_128, undef early-clobber %1.sub3:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } -# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, 0, implicit $exec --- name: subreg_part @@ -73,18 +73,18 @@ body: | bb.0: %0 = IMPLICIT_DEF - undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec - %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec - %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec + undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, 0, implicit $exec + %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, 0, implicit $exec + %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: dead{{$}} # GCN: dead early-clobber %2:vreg_128, dead early-clobber %4:vreg_128, dead early-clobber %1:vreg_128, dead early-clobber %3:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } --- @@ -99,18 +99,18 @@ body: | bb.0: %0 = IMPLICIT_DEF - dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec - dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec - dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec - dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec + dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec + dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec + dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec + dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: subreg_dead{{$}} # GCN: early-clobber %1:vreg_64 = BUNDLE %0, implicit $exec { -# GCN-NEXT: %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } -# GCN-NEXT: GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, 0, implicit $exec --- name: subreg_dead @@ -121,15 +121,15 @@ body: | bb.0: %0 = IMPLICIT_DEF - undef %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec - dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, implicit $exec + undef %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, 0, implicit $exec + dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: kill{{$}} # GCN: early-clobber %2:vreg_128, early-clobber %3:vreg_128 = BUNDLE %0, %1, implicit $exec { -# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } --- @@ -144,17 +144,17 @@ bb.0: %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF - %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec - %3:vreg_128 = GLOBAL_LOAD_DWORDX4 killed %1, 16, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %3, 16, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 killed %1, 16, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %3, 16, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: indirect{{$}} -# GCN: %1:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, implicit $exec +# GCN: %1:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec # GCN-NEXT: early-clobber %2:vreg_128, early-clobber %3:vreg_128 = BUNDLE %1, implicit $exec { -# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } --- @@ -168,18 +168,18 @@ body: | bb.0: %0 = IMPLICIT_DEF - %1:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, implicit $exec - %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, implicit $exec - %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %3, 16, 0, 0, 0, implicit $exec + %1:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %3, 16, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: stack{{$}} # GCN: %0:vreg_64 = IMPLICIT_DEF -# GCN-NEXT: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 16, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, 0, implicit $exec --- name: stack @@ -193,33 +193,33 @@ body: | bb.0: %0 = IMPLICIT_DEF - %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 0, 0, 0, 0, implicit $exec - %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 16, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %2, 16, 0, 0, 0, implicit $exec + %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 0, 0, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %stack.0, 16, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2, 16, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: overflow_counter{{$}} # GCN: dead early-clobber %7:vgpr_32, dead early-clobber %14:vgpr_32, dead early-clobber %2:vgpr_32, dead early-clobber %9:vgpr_32, dead early-clobber %4:vgpr_32, dead early-clobber %11:vgpr_32, dead early-clobber %6:vgpr_32, dead early-clobber %13:vgpr_32, dead early-clobber %1:vgpr_32, dead early-clobber %8:vgpr_32, dead early-clobber %15:vgpr_32, dead early-clobber %3:vgpr_32, dead early-clobber %10:vgpr_32, dead early-clobber %5:vgpr_32, dead early-clobber %12:vgpr_32 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 
0, 0, implicit $exec +# GCN-NEXT: dead %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } # GCN-NEXT: dead early-clobber %16:vgpr_32, dead early-clobber %17:vgpr_32 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } --- @@ -247,36 +247,36 @@ body: | bb.0: %0 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, implicit $exec - %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, implicit $exec - %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec - %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, implicit $exec - %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, 
implicit $exec - %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, implicit $exec - %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec - %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, implicit $exec - %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, implicit $exec - %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, implicit $exec - %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, implicit $exec - %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, implicit $exec - %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, implicit $exec - %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, implicit $exec - %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, implicit $exec + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, 0, implicit $exec + %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, 0, implicit $exec + %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, 0, implicit $exec + %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, 0, implicit $exec + %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, 0, implicit $exec + %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, 0, implicit $exec + %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, 0, implicit $exec + %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, 0, implicit $exec + %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, 0, implicit $exec + %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, 0, implicit $exec + %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, 0, implicit $exec + %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, 0, implicit $exec + %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, 0, implicit $exec + %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: reg_pressure{{$}} # GCN: dead early-clobber %2:vreg_128, dead early-clobber %4:vreg_128, dead early-clobber %1:vreg_128, dead early-clobber %3:vreg_128, dead early-clobber %5:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } # GCN-NEXT: dead early-clobber %7:vreg_128, dead early-clobber %6:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } --- @@ -294,22 +294,22 @@ body: | bb.0: %0 = IMPLICIT_DEF - %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec - %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec - %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec - %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec - %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, implicit $exec - %6:vreg_128 = GLOBAL_LOAD_DWORDX4 
%0, 80, 0, 0, 0, implicit $exec - %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, implicit $exec + %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec + %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, 0, implicit $exec + %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, 0, 0, 0, implicit $exec + %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: {{^}}name: image_clause{{$}} # GCN: early-clobber %4:vreg_128, early-clobber %3:vreg_128, early-clobber %5:vreg_128 = BUNDLE %0, undef %2:sgpr_128, %1, implicit $exec { -# GCN-NEXT: %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } -# GCN-NEXT: IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec --- name: image_clause @@ -325,19 +325,19 @@ bb.0: %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF - %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) - %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 
implicit $exec :: (load 16) - %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) - IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) - IMAGE_STORE_V4_V2 %4, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) - IMAGE_STORE_V4_V2 %5, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + IMAGE_STORE_V4_V2 %4, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + IMAGE_STORE_V4_V2 %5, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) ... 
# GCN-LABEL: {{^}}name: mixed_clause{{$}} # GCN: dead early-clobber %4:vreg_128, dead early-clobber %3:vreg_128, dead early-clobber %5:vgpr_32 = BUNDLE %0, %2, %1, implicit $exec { -# GCN-NEXT: dead %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } --- @@ -355,17 +355,17 @@ %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF %2 = IMPLICIT_DEF - %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) - %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: atomic{{$}} # GCN: %1:vgpr_32 = IMPLICIT_DEF -# GCN-NEXT: dead %2:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, implicit $exec, implicit $flat_scr -# GCN-NEXT: dead %3:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, implicit $exec, implicit $flat_scr -# GCN-NEXT: FLAT_ATOMIC_ADD %0, %1, 0, 0, implicit $exec, implicit $flat_scr -# GCN-NEXT: FLAT_ATOMIC_ADD %0, %1, 0, 0, implicit $exec, implicit $flat_scr +# GCN-NEXT: dead %2:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, 0, implicit $exec, implicit $flat_scr +# GCN-NEXT: dead %3:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, 0, implicit $exec, implicit $flat_scr +# GCN-NEXT: FLAT_ATOMIC_ADD %0, %1, 0, 0, 0, implicit $exec, implicit $flat_scr +# GCN-NEXT: FLAT_ATOMIC_ADD %0, %1, 0, 0, 0, implicit $exec, implicit $flat_scr # GCN-NEXT: S_ENDPGM 0 --- @@ -380,9 +380,31 @@ bb.0: %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF - %2:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, implicit $exec, implicit $flat_scr - %3:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, implicit $exec, implicit $flat_scr - FLAT_ATOMIC_ADD %0, %1, 0, 0, implicit $exec, implicit $flat_scr - FLAT_ATOMIC_ADD %0, %1, 0, 0, implicit $exec, implicit $flat_scr + %2:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, 0, implicit $exec, implicit $flat_scr + %3:vgpr_32 = FLAT_ATOMIC_ADD_RTN %0, %1, 0, -1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_ATOMIC_ADD %0, %1, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_ATOMIC_ADD %0, %1, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... 
+ +# GCN-LABEL: {{^}}name: mem_clause_sreg256_used_stack{{$}} +# GCN: dead undef early-clobber %0.sub7:sgpr_256, dead undef early-clobber %0.sub3:sgpr_256 = BUNDLE undef %1:sgpr_64(p4) { +# GCN-NEXT: undef %0.sub7:sgpr_256 = S_LOAD_DWORD_IMM undef %1:sgpr_64(p4), 8, 0, 0 +# GCN-NEXT: internal dead %0.sub3:sgpr_256 = S_LOAD_DWORD_IMM undef %1:sgpr_64(p4), 24, 0, 0 +# GCN-NEXT: } +--- +name: mem_clause_sreg256_used_stack +stack: + - { id: 0, type: default, offset: 0, size: 40, alignment: 8 } +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + + undef %0.sub7:sgpr_256 = S_LOAD_DWORD_IMM undef %1:sgpr_64(p4), 8, 0, 0 + %0.sub3:sgpr_256 = S_LOAD_DWORD_IMM undef %1:sgpr_64(p4), 24, 0, 0 + S_ENDPGM 0 + +... Index: llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir +++ llvm/test/CodeGen/AMDGPU/merge-image-load-gfx10.mir @@ -13,7 +13,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -31,7 +31,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -50,7 +50,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5:vreg_64, %3:sgpr_256, 3, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5:vreg_64, %3:sgpr_256, 12, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... 
@@ -69,7 +69,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5:vreg_64, %3:sgpr_256, 12, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5:vreg_64, %3:sgpr_256, 3, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... @@ -88,7 +88,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... 
@@ -107,7 +107,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... @@ -124,12 +124,12 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %9:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %7:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %10:vreg_128 = 
BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %11:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- @@ -146,9 +146,9 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vreg_128 = COPY %2 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - IMAGE_STORE_V4_V2 %4:vreg_128, %5:vreg_64, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + IMAGE_STORE_V4_V2 %4:vreg_128, %5:vreg_64, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- @@ -165,7 +165,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -183,7 +183,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 11, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -201,8 +201,8 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %8:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %6, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -221,7 +221,7 @@ %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %5:vgpr_32 = COPY %2.sub3 - %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %7:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %6, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %8:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %6, %4, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -239,7 +239,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -257,7 +257,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -275,7 +275,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -293,7 +293,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -311,7 +311,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -329,7 +329,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = COPY %5.sub0 %7:vgpr_32 = IMAGE_LOAD_V1_V1_gfx10 %6, %3, 8, 1, -1, 0, 0, 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %8:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) @@ -348,7 +348,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 12, align 16, addrspace 4) ... @@ -366,7 +366,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_LOAD_V2_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -384,7 +384,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_V1_V2_gfx10 %5, %3, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_V3_V2_gfx10 %5, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -403,7 +403,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_MIP_V1_V3_gfx10 %5:vreg_96, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_MIP_V3_V3_gfx10 %5:vreg_96, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -424,7 +424,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_V1_V3_gfx10 %5:vreg_96, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_MIP_PCK_V3_V3_gfx10 %5:vreg_96, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -445,7 +445,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_SGN_V1_V3_gfx10 %5:vreg_96, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_MIP_PCK_SGN_V3_V3_gfx10 %5:vreg_96, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -464,7 +464,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_PCK_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_PCK_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -483,7 +483,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_LOAD_PCK_SGN_V1_V2_gfx10 %5:vreg_64, %3:sgpr_256, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_LOAD_PCK_SGN_V3_V2_gfx10 %5:vreg_64, %3:sgpr_256, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... Index: llvm/test/CodeGen/AMDGPU/merge-image-load.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/merge-image-load.mir +++ llvm/test/CodeGen/AMDGPU/merge-image-load.mir @@ -1,7 +1,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s # GFX9-LABEL: name: image_load_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -13,13 +13,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - 
%7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- + # GFX9-LABEL: name: image_load_merged_v1v3_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2 @@ -31,14 +32,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 
0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_load_merged_v2v2 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub0_sub1 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub2_sub3 @@ -50,14 +51,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) - %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 3, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 12, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_merged_v2v2_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub2_sub3 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub0_sub1 @@ -69,14 +70,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) - %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 12, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_64 = IMAGE_LOAD_V2_V4 %5:vreg_128, %3:sgpr_256, 3, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_merged_v3v1 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub0_sub1_sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub3 @@ -88,14 +89,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_merged_v3v1_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0 @@ -107,14 +108,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_divided_merged -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) name: image_load_divided_merged body: | @@ -124,19 +125,19 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %9:vreg_96 = IMAGE_LOAD_V3_V4 %7:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %11:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %8:vreg_128 = 
BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %9:vreg_96 = IMAGE_LOAD_V3_V4 %7:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %11:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_load_divided_not_merged -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_divided_not_merged body: | @@ -146,16 +147,16 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vreg_128 = COPY %2 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 
0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_load_dmask_overlapped_not_merged -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_dmask_overlapped_not_merged body: | @@ -165,15 +166,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, 
implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_load_dmask_not_disjoint_not_merged -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 11, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_dmask_not_disjoint_not_merged body: | @@ -183,15 +184,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5:vreg_128, %3:sgpr_256, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5:vreg_128, %3:sgpr_256, 11, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_0 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %6, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %6, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_0 body: | @@ -201,16 +202,16 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_1 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %6, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %6, %4, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %6, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %6, %4, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_1 body: | @@ -221,15 +222,15 @@ %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %5:vgpr_32 = COPY %2.sub3 - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %7:vgpr_32 = IMAGE_LOAD_V1_V4 %6, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %4, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = IMAGE_LOAD_V1_V4 %6, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_LOAD_V3_V4 %6, %4, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_10 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_10 body: | @@ -239,15 +240,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_3 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_3 body: | @@ -257,15 +258,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_4 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_4 body: | @@ -275,15 +276,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_5 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_5 body: | @@ -293,15 +294,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_6 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_6 body: | @@ -311,15 +312,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_7 -# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_7 body: | @@ -329,15 +330,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_8 -# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_8 body: | @@ -347,15 +348,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_LOAD_V2_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_not_merged_9 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_load_not_merged_9 body: | @@ -365,14 +366,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_V1_V4 %5, %3, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_V3_V4 %5, %3, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_mip_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -384,16 +385,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_MIP_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_MIP_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_MIP_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_MIP_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - - # GFX9-LABEL: name: image_load_mip_pck_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -405,16 +404,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_MIP_PCK_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_MIP_PCK_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - - # GFX9-LABEL: name: image_load_mip_pck_sgn_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_MIP_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -426,14 +423,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_SGN_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_MIP_PCK_SGN_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_MIP_PCK_SGN_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_MIP_PCK_SGN_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_pck_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -445,14 +442,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_PCK_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_PCK_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_PCK_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_PCK_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_load_pck_sgn_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_LOAD_PCK_SGN_V4_V4 %5, %3, 15, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -464,8 +461,9 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_LOAD_PCK_SGN_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_LOAD_PCK_SGN_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_LOAD_PCK_SGN_V1_V4 %5:vreg_128, %3:sgpr_256, 1, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_LOAD_PCK_SGN_V3_V4 %5:vreg_128, %3:sgpr_256, 14, 0, 0, 0, 0, 0, 0, 0, -1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- + Index: llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir +++ llvm/test/CodeGen/AMDGPU/merge-image-sample-gfx10.mir @@ -13,7 +13,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -31,7 +31,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -50,7 +50,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 3, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 12, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... @@ -69,7 +69,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 12, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) %7:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 3, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... 
@@ -88,7 +88,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... @@ -107,7 +107,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... 
@@ -124,12 +124,12 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %9:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %7:vgpr_32, %7:vgpr_32, %7:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %11:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- @@ -146,7 +146,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vreg_128 = COPY %2 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) IMAGE_STORE_V4_V2_nsa_gfx10 %4:vreg_128, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, 15, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) @@ -165,7 +165,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -183,7 +183,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 4, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 11, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -201,8 +201,8 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %8:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %6, %6, %6, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -221,7 +221,7 @@ %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %5:vgpr_32 = COPY %2.sub3 - %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %6, %6, %6, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %8:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %6, %6, %6, %4, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -240,7 +240,7 @@ %3:sgpr_128 = COPY $sgpr92_sgpr93_sgpr94_sgpr95 %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %5:vgpr_32 = COPY %2.sub3 - %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %6, %6, %6, %4, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %8:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %6, %6, %6, %4, %3, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -258,7 +258,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -276,7 +276,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -294,7 +294,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -312,7 +312,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -330,7 +330,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V2_nsa_gfx10 %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -348,7 +348,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -366,7 +366,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vreg_64 = IMAGE_SAMPLE_L_V2_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -384,7 +384,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V3_nsa_gfx10 %5, %5, %5, %3, %2, 8, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_V3_V3_nsa_gfx10 %5, %5, %5, %3, %2, 7, 1, -1, 0, 0, 0, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -406,7 +406,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_V1_V2_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_V3_V2_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -426,7 +426,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_B_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_B_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -446,7 +446,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_B_CL_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -466,7 +466,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_B_CL_O_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -486,7 +486,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_B_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -506,7 +506,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -526,7 +526,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CD_V1_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CD_V3_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -546,7 +546,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CD_CL_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -566,7 +566,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_O_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CD_CL_O_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -586,7 +586,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CD_O_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CD_O_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -606,7 +606,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CL_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -626,7 +626,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_CL_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -646,7 +646,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_B_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -666,7 +666,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -686,7 +686,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_O_V1_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -706,7 +706,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_B_O_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -726,7 +726,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CD_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CD_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -746,7 +746,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -766,7 +766,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_O_V1_V9_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_O_V3_V9_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -786,7 +786,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CD_O_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CD_O_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -806,7 +806,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CL_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -826,7 +826,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_CL_O_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -846,7 +846,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_D_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -866,7 +866,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -886,7 +886,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V9_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V9_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -906,7 +906,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_D_O_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -926,7 +926,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_L_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -946,7 +946,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -966,7 +966,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -986,7 +986,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_L_O_V1_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V5_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -1006,7 +1006,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_C_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -1026,7 +1026,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_D_V1_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_D_V3_V6_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -1046,7 +1046,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_D_CL_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -1066,7 +1066,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_D_CL_O_V1_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V8_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -1086,7 +1086,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_D_O_V1_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V7_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -1106,7 +1106,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V2_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -1126,7 +1126,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_LZ_O_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... @@ -1146,7 +1146,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_L_O_V1_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
@@ -1166,7 +1166,7 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) %6:vgpr_32 = IMAGE_SAMPLE_O_V1_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) %7:vreg_96 = IMAGE_SAMPLE_O_V3_V3_nsa_gfx10 %5:vgpr_32, %5:vgpr_32, %5:vgpr_32, %3:sgpr_256, %2:sgpr_128, 14, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... Index: llvm/test/CodeGen/AMDGPU/merge-image-sample.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/merge-image-sample.mir +++ llvm/test/CodeGen/AMDGPU/merge-image-sample.mir @@ -1,7 +1,7 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s # GFX9-LABEL: name: image_sample_l_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -13,13 +13,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 
%5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- + # GFX9-LABEL: name: image_sample_l_merged_v1v3_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub3 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub0_sub1_sub2 @@ -31,14 +32,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_sample_l_merged_v2v2 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub0_sub1 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub2_sub3 @@ -50,14 +51,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) - %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_merged_v2v2_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %8.sub2_sub3 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %8.sub0_sub1 @@ -69,14 +70,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) - %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 12, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) + %7:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 3, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 8, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_merged_v3v1 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub0_sub1_sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub3 @@ -88,14 +89,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_merged_v3v1_reversed -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %8.sub1_sub2_sub3 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %8.sub0 @@ -107,14 +108,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_divided_merged -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) name: image_sample_l_divided_merged body: | @@ -124,19 +125,19 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %9:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %7:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) - %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %11:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable invariant load 16) + %8:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %9:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %7:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %10:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %11:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_sample_l_divided_not_merged -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_divided_not_merged body: | @@ -146,16 +147,16 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vreg_128 = COPY %2 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, 
%3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + IMAGE_STORE_V4_V4 %4:vreg_128, %5:vreg_128, %3:sgpr_256, 15, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_sample_l_dmask_overlapped_not_merged -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_dmask_overlapped_not_merged body: | @@ -165,15 +166,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, -1, 0, 
implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_sample_l_dmask_not_disjoint_not_merged -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 11, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_dmask_not_disjoint_not_merged body: | @@ -183,15 +184,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 11, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 4, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 11, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_sample_l_not_merged_0 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_0 body: | @@ -201,16 +202,16 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 
16) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... --- # GFX9-LABEL: name: image_sample_l_not_merged_1 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_1 body: | @@ -221,15 +222,15 @@ %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %5:vgpr_32 = COPY %2.sub3 - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_not_merged_2 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_2 body: | @@ -240,15 +241,15 @@ %3:sgpr_128 = COPY $sgpr92_sgpr93_sgpr94_sgpr95 %4:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %5:vgpr_32 = COPY %2.sub3 - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %7:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %6, %4, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %8:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %6, %4, %3, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_not_merged_3 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_3 body: | @@ -258,15 +259,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 1, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_not_merged_4 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_4 body: | @@ -276,15 +277,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 1, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_not_merged_5 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_5 body: | @@ -294,15 +295,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 1, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_not_merged_6 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_6 body: | @@ -312,15 +313,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 1, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_not_merged_7 -# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_7 body: | @@ -330,15 +331,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 1, 0, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_not_merged_8 -# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_8 body: | @@ -348,15 +349,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vreg_64 = IMAGE_SAMPLE_L_V2_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 1, -1, 0, implicit $exec :: (dereferenceable load 8, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_not_merged_9 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_9 body: | @@ -366,15 +367,15 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- # GFX9-LABEL: name: image_sample_l_not_merged_10 -# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) -# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) name: image_sample_l_not_merged_10 body: | @@ -384,17 +385,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_V1_V4 %5, %3, %2, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_V3_V4 %5, %3, %2, 7, 0, 0, 0, 0, 0, 0, 0, -1, 1, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - - - # GFX9-LABEL: name: image_sample_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -406,15 +404,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_b_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -426,15 +423,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_b_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -446,15 +442,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_b_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -466,15 +461,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_b_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -486,15 +480,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -506,15 +499,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_cd_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -526,15 +518,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_cd_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -546,15 +537,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_cd_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -566,15 +556,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_cd_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -586,15 +575,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -606,15 +594,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -626,15 +613,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_b_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -646,15 +632,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_B_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_B_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_b_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -666,15 +651,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_b_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -686,15 +670,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_B_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_B_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_b_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_B_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -706,15 +689,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_B_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_B_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_cd_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -726,15 +708,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_CD_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_CD_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_cd_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -746,15 +727,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_cd_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -766,15 +746,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_CD_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_CD_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_cd_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CD_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -786,15 +765,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_CD_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_CD_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -806,15 +784,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -826,15 +803,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_d_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -846,15 +822,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_d_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -866,15 +841,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_d_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -886,15 +860,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_d_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -906,15 +879,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_l_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -926,15 +898,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_L_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_L_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_lz_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -946,15 +917,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_lz_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -966,15 +936,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_l_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -986,15 +955,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_c_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_C_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1006,15 +974,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_C_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_C_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_C_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_d_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1026,15 +993,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_D_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_D_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_d_cl_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1046,15 +1012,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_D_CL_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_D_CL_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_d_cl_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_CL_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1066,15 +1031,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_D_CL_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_D_CL_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_d_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_D_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1086,15 +1050,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_D_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_D_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_lz_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1106,15 +1069,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_LZ_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_lz_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_LZ_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1126,15 +1088,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_LZ_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_LZ_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_l_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_L_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1146,15 +1107,14 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_L_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_L_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- - # GFX9-LABEL: name: image_sample_o_merged_v1v3 -# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = IMAGE_SAMPLE_O_V4_V4 %5, %3, %2, 15, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %8.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %8.sub1_sub2_sub3 @@ -1166,8 +1126,8 @@ %2:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 %3:sgpr_256 = S_LOAD_DWORDX8_IMM %1, 208, 0, 0 %4:vgpr_32 = COPY %2.sub3 - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) - %6:vgpr_32 = IMAGE_SAMPLE_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) - %7:vreg_96 = IMAGE_SAMPLE_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %2:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 16) + %6:vgpr_32 = IMAGE_SAMPLE_O_V1_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + %7:vreg_96 = IMAGE_SAMPLE_O_V3_V4 %5:vreg_128, %3:sgpr_256, %2:sgpr_128, 14, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (dereferenceable load 12, align 16, addrspace 4) ... 
--- Index: llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/merge-load-store-agpr.mir @@ -0,0 +1,94 @@ +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +# GCN-LABEL: name: ds_read_b32_v_v +# GCN: vreg_64 = DS_READ2_B32 +name: ds_read_b32_v_v +body: | + bb.0: + + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`) + %2:vgpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`) +... + +# GCN-LABEL: name: ds_read_b32_a_a +# GCN: areg_64 = DS_READ2_B32 +name: ds_read_b32_a_a +body: | + bb.0: + + %0:vgpr_32 = IMPLICIT_DEF + %1:agpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`) + %2:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`) +... + +# GCN-LABEL: name: ds_read_b32_v_a +# GCN: vgpr_32 = DS_READ_B32 +# GCN: agpr_32 = DS_READ_B32 +name: ds_read_b32_v_a +body: | + bb.0: + + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`) + %2:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`) +... + +# GCN-LABEL: name: ds_read_b32_a_v +# GCN: agpr_32 = DS_READ_B32 +# GCN: vgpr_32 = DS_READ_B32 +name: ds_read_b32_a_v +body: | + bb.0: + + %0:vgpr_32 = IMPLICIT_DEF + %1:agpr_32 = DS_READ_B32_gfx9 %0, 8, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`) + %2:vgpr_32 = DS_READ_B32_gfx9 %0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(3)* undef`) +... 
+ +# GCN-LABEL: name: ds_write_b32_v_v +# GCN: DS_WRITE2_B32_gfx9 %0, undef %1:vgpr_32, undef %2:vgpr_32 +name: ds_write_b32_v_v +body: | + bb.0: + + %0:vgpr_32 = IMPLICIT_DEF + DS_WRITE_B32_gfx9 %0, undef %1:vgpr_32, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32_gfx9 %0, undef %2:vgpr_32, 8, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`) +... + +# GCN-LABEL: name: ds_write_b32_a_a +# GCN: DS_WRITE_B32_gfx9 %0, undef %1:agpr_32 +# GCN: DS_WRITE_B32_gfx9 %0, undef %2:agpr_32 +name: ds_write_b32_a_a +body: | + bb.0: + + %0:vgpr_32 = IMPLICIT_DEF + DS_WRITE_B32_gfx9 %0, undef %1:agpr_32, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32_gfx9 %0, undef %2:agpr_32, 8, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`) +... + +# GCN-LABEL: name: ds_write_b32_v_a +# GCN: DS_WRITE_B32_gfx9 %0, undef %1:vgpr_32 +# GCN: DS_WRITE_B32_gfx9 %0, undef %2:agpr_32 +name: ds_write_b32_v_a +body: | + bb.0: + + %0:vgpr_32 = IMPLICIT_DEF + DS_WRITE_B32_gfx9 %0, undef %1:vgpr_32, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32_gfx9 %0, undef %2:agpr_32, 8, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`) +... + +# GCN-LABEL: name: ds_write_b32_a_v +# GCN: DS_WRITE_B32_gfx9 %0, undef %1:agpr_32 +# GCN: DS_WRITE_B32_gfx9 %0, undef %2:vgpr_32 +name: ds_write_b32_a_v +body: | + bb.0: + + %0:vgpr_32 = IMPLICIT_DEF + DS_WRITE_B32_gfx9 %0, undef %1:agpr_32, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`) + DS_WRITE_B32_gfx9 %0, undef %2:vgpr_32, 8, 0, implicit $exec :: (store 4 into `i32 addrspace(3)* undef`) +... 
Index: llvm/test/CodeGen/AMDGPU/merge-load-store.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/merge-load-store.mir +++ llvm/test/CodeGen/AMDGPU/merge-load-store.mir @@ -170,10 +170,10 @@ --- # CHECK-LABEL: merge_mmos # CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 8, align 4) -# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4) -# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4) -# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4 -# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4 +# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4) +# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4 +# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4 name: merge_mmos tracksRegLiveness: true body: | @@ -183,22 +183,22 @@ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 4) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 1, 0, 0 :: (dereferenceable invariant load 4) - %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) - %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 4) - BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) - BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 64) - %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68) - BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64) - BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68) + %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) + %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) + BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) + BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 64) + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68) + BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64) + BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68) S_ENDPGM 0 ... 
--- # CHECK-LABEL: reorder_offsets -# CHECK-DAG: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.reorder_addr1 + 16, align 4, addrspace 1) -# CHECK-DAG: BUFFER_STORE_DWORDX4_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into %ir.reorder_addr1, align 4, addrspace 1) +# CHECK-DAG: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.reorder_addr1 + 16, align 4, addrspace 1) +# CHECK-DAG: BUFFER_STORE_DWORDX4_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into %ir.reorder_addr1, align 4, addrspace 1) name: reorder_offsets tracksRegLiveness: true @@ -208,12 +208,12 @@ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 4) - BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 8) - BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 12, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 12) - BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 16) - BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 20) - BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 4) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into 
%ir.reorder_addr1 + 8) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 12) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 16) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 20) + BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1) S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir +++ llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir @@ -6,7 +6,7 @@ # # GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz -# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0 # GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %7.sub1_sub2_sub3 name: gfx9_tbuffer_load_x_xyz @@ -17,13 +17,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vreg_96 = 
TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) ... --- # GFX9-LABEL: name: gfx9_tbuffer_load_xyz_x -# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %7.sub0_sub1_sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub3 name: gfx9_tbuffer_load_xyz_x @@ -34,13 +34,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy -# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub2_sub3 name: gfx9_tbuffer_load_xy_xy @@ -51,13 +51,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) - %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_load_x_xy -# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0 # GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub1_sub2 name: gfx9_tbuffer_load_x_xy @@ -68,13 +68,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_load_xy_x -# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub2 name: gfx9_tbuffer_load_xy_x @@ -85,14 +85,14 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_load_x_x -# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1 @@ -104,13 +104,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32 -# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1 @@ -122,24 +122,24 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_load_float_32 -# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 -# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 @@ -153,30 +153,30 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec 
:: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, 
addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... --- # GFX9-LABEL: name: gfx9_tbuffer_load_sint_32 -# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 91, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 91, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 -# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 93, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 # GFX9: 
%{{[0-9]+}}:vgpr_32 = COPY %18.sub0 @@ -190,30 +190,30 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 
4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... --- # GFX9-LABEL: name: gfx9_tbuffer_load_uint_32 -# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 75, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 -# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 # GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX9: %{{[0-9]+}}:vgpr_32 
= COPY killed %15.sub1 -# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 # GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 @@ -227,15 +227,15 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 
4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- @@ -245,15 +245,15 @@ # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = 
TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) name: gfx9_tbuffer_load_not_merged_data_format_mismatch @@ -264,15 +264,15 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 
116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... --- @@ -282,15 +282,15 @@ # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, 
addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) name: gfx9_tbuffer_load_not_merged_num_format_mismatch body: | bb.0.entry: @@ -299,22 +299,22 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - 
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = 
TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... --- # GFX9-LABEL: name: gfx9_tbuffer_store_x_xyz # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %9, %subreg.sub1_sub2_sub3 -# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) name: gfx9_tbuffer_store_x_xyz body: | bb.0.entry: @@ -329,8 +329,8 @@ %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) ... 
--- @@ -339,7 +339,7 @@ # GFX9-LABEL: name: gfx9_tbuffer_store_xyz_x # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1_sub2, %0, %subreg.sub3 -# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) name: gfx9_tbuffer_store_xyz_x body: | bb.0.entry: @@ -354,8 +354,8 @@ %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 - TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- @@ -363,7 +363,7 @@ # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1, %10, %subreg.sub2_sub3 -# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) name: gfx9_tbuffer_store_xy_xy body: | bb.0.entry: @@ -379,15 +379,15 @@ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1 - TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_store_x_xy # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_64, %subreg.sub1_sub2 -# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx9_tbuffer_store_x_xy body: | bb.0.entry: @@ -402,15 +402,15 @@ %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_store_xy_x # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %9, %subreg.sub0_sub1, %0, %subreg.sub2 -# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx9_tbuffer_store_xy_x body: | bb.0.entry: @@ -426,15 +426,15 @@ %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 - TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_store_x_x # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1 -# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) name: gfx9_tbuffer_store_x_x body: | bb.0.entry: @@ -448,14 +448,14 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- # GFX9-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1 -# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) name: gfx9_tbuffer_store_x_x_format_32_32_32_32 body: | bb.0.entry: @@ -469,8 +469,8 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- @@ -490,14 +490,14 @@ # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 -# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 123, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 -# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 126, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 -# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 125, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx9_tbuffer_store_float32 body: | bb.0.entry: @@ -516,15 +516,15 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact 
%8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... --- @@ -544,14 +544,14 @@ # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 -# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 91, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 91, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 -# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 94, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, 
%subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 -# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 93, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx9_tbuffer_store_sint32 body: | bb.0.entry: @@ -570,15 +570,15 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 84, 
0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- @@ -598,14 +598,14 @@ # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 -# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 75, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 # GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 -# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 78, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 # GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 -# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx9_tbuffer_store_uint32 body: | bb.0.entry: @@ -624,15 +624,15 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, 
%13:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... --- @@ -651,15 +651,15 @@ # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, 
addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) name: gfx9_tbuffer_store_not_merged_data_format_mismatch body: | bb.0.entry: @@ -678,15 +678,15 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 
116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + 
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... --- @@ -705,15 +705,15 @@ # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1 # GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) name: gfx9_tbuffer_store_not_merged_num_format_mismatch body: | bb.0.entry: @@ -732,22 +732,22 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - 
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, 0, implicit $exec 
:: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... --- # GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0 -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) name: gfx9_tbuffer_load_not_merged_swizzled_0 body: | bb.0.entry: @@ -756,15 +756,15 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = 
TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... --- # GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1 -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) name: gfx9_tbuffer_load_not_merged_swizzled_1 body: | bb.0.entry: @@ -773,8 +773,8 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- @@ -784,7 +784,7 @@ # # GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz -# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0 # GFX10: %{{[0-9]+}}:vreg_96 = COPY killed %7.sub1_sub2_sub3 name: gfx10_tbuffer_load_x_xyz @@ -795,13 +795,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_load_xyz_x -# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_96 = COPY %7.sub0_sub1_sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub3 name: gfx10_tbuffer_load_xyz_x @@ -812,13 +812,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) + %7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy -# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1 # GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub2_sub3 name: gfx10_tbuffer_load_xy_xy @@ -829,13 +829,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) - %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_load_x_xy -# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0 # GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub1_sub2 name: gfx10_tbuffer_load_x_xy @@ -846,13 +846,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_load_xy_x -# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub2 name: gfx10_tbuffer_load_xy_x @@ -863,14 +863,14 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_load_x_x -# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1 @@ -882,13 +882,13 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32 -# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1 @@ -900,24 +900,24 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_load_float_32 -# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 -# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 @@ -931,30 +931,30 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 
1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... --- # GFX10-LABEL: name: gfx10_tbuffer_load_sint_32 -# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 63, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 63, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 -# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 76, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 73, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 # 
GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 @@ -968,30 +968,30 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... --- # GFX10-LABEL: name: gfx10_tbuffer_load_uint_32 -# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 62, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 62, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1 -# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 75, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3 # GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY 
%15.sub0 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1 -# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 72, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2 # GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0 @@ -1005,15 +1005,15 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 20, 0, 0, 0, 
0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- @@ -1023,15 +1023,15 @@ # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = 
TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) name: gfx10_tbuffer_load_not_merged_data_format_mismatch @@ -1042,15 +1042,15 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 
22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 4, align 1, addrspace 4) ... --- @@ -1060,15 +1060,15 @@ # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) 
+# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) name: gfx10_tbuffer_load_not_merged_num_format_mismatch body: | bb.0.entry: @@ -1077,15 +1077,15 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %11:vgpr_32 = 
TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 
0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... --- @@ -1094,7 +1094,7 @@ # GFX10-LABEL: name: gfx10_tbuffer_store_x_xyz # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %9, %subreg.sub1_sub2_sub3 -# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) name: gfx10_tbuffer_store_x_xyz body: | bb.0.entry: @@ -1109,8 +1109,8 @@ %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) ... 
--- @@ -1118,7 +1118,7 @@ # GFX10-LABEL: name: gfx10_tbuffer_store_xyz_x # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1_sub2, %0, %subreg.sub3 -# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) name: gfx10_tbuffer_store_xyz_x body: | bb.0.entry: @@ -1133,8 +1133,8 @@ %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 - TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- @@ -1142,7 +1142,7 @@ # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1, %10, %subreg.sub2_sub3 -# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) name: gfx10_tbuffer_store_xy_xy body: | bb.0.entry: @@ -1158,15 +1158,15 @@ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1 - TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_store_x_xy # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_64, %subreg.sub1_sub2 -# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx10_tbuffer_store_x_xy body: | bb.0.entry: @@ -1181,15 +1181,15 @@ %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_store_xy_x # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %9, %subreg.sub0_sub1, %0, %subreg.sub2 -# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx10_tbuffer_store_xy_x body: | bb.0.entry: @@ -1205,15 +1205,15 @@ %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 - TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_store_x_x # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1 -# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) name: gfx10_tbuffer_store_x_x body: | bb.0.entry: @@ -1227,14 +1227,14 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- # GFX10-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1 -# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) name: gfx10_tbuffer_store_x_x_format_32_32_32_32 body: | bb.0.entry: @@ -1248,8 +1248,8 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- @@ -1269,14 +1269,14 @@ # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 -# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 -# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 77, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 -# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 74, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx10_tbuffer_store_float32 body: | bb.0.entry: @@ -1295,15 +1295,15 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 
0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact 
%8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... --- @@ -1323,14 +1323,14 @@ # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 -# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 63, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 63, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 -# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 76, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, 
%1, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 -# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 73, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx10_tbuffer_store_sint32 body: | bb.0.entry: @@ -1349,15 +1349,15 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 
0, 44, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... 
--- @@ -1377,14 +1377,14 @@ # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1 -# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 62, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 62, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2 # GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3 -# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 75, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4) # GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1 # GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2 -# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 72, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4) name: gfx10_tbuffer_store_uint32 body: | bb.0.entry: @@ -1403,15 +1403,15 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 20, 0, 0, 
0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact 
%8:vgpr_32, %13:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... --- @@ -1430,15 +1430,15 @@ # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 
4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) name: gfx10_tbuffer_store_not_merged_data_format_mismatch body: | bb.0.entry: @@ -1457,15 +1457,15 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, 
%13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + 
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... --- @@ -1484,15 +1484,15 @@ # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1 # GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0 # GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3 -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) -# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) +# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) name: gfx10_tbuffer_store_not_merged_num_format_mismatch body: | bb.0.entry: @@ -1511,22 +1511,22 @@ %1:sgpr_32 = COPY $sgpr1 %0:sgpr_32 = COPY $sgpr0 %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - 
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) - TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) + TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4) ... --- # GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0 -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) name: gfx10_tbuffer_load_not_merged_swizzled_0 body: | bb.0.entry: @@ -1535,15 +1535,15 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = 
TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... --- # GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1 -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) -# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) +# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) name: gfx10_tbuffer_load_not_merged_swizzled_1 body: | bb.0.entry: @@ -1552,8 +1552,8 @@ %2:sgpr_32 = COPY $sgpr2 %3:sgpr_32 = COPY $sgpr3 %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 - %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) - %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) + %8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4) ... 
--- Index: llvm/test/CodeGen/AMDGPU/mfma-loop.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1,20 +1,26 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A %s ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit: -; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 +; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] ; Check that we do not copy agprs to vgprs and back inside the loop. ; GCN: [[LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] ; Final result should be read only once after the loop. -; GCN-COUNT-32: v_accvgpr_read_b32 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) { entry: @@ -39,16 +45,21 @@ ; 3 vgprs are needed to avoid wait states between writes. ; Check that we do not use 32 temp sgprs as well. 
-; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 -; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]] +; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] ; GCN: [[LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] -; GCN-COUNT-32: v_accvgpr_read_b32 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) { entry: @@ -69,17 +80,21 @@ ; GCN-LABEL: {{^}}test_mfma_loop_non_splat: -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} -; GCN-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GCN: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}} +; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} +; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}} ; GCN: [[LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] -; GCN-COUNT-32: v_accvgpr_read_b32 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) { entry: @@ -103,78 +118,147 @@ ; Check that we 
do not use 32 temp vgprs, but rotate 3 vgprs only. ; 3 vgprs are needed to avoid wait states between writes. -; GCN: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000 -; GCN: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000 -; GCN: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000 -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; 
GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] -; GCN: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} -; GCN: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000 +; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000 +; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000 +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} +; GFX908: 
v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: 
v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] +; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} +; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] + +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, 
[[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, 
[[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} +; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] + ; GCN: [[LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] -; GCN-COUNT-32: v_accvgpr_read_b32 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) { entry: @@ -198,12 +282,15 @@ ; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}} ; GCN: [[LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] -; GCN-COUNT-32: v_accvgpr_read_b32 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) { entry: @@ -259,16 +346,21 @@ ; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init: -; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} -; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]] +; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] ; GCN: [[LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: 
s_cbranch_scc1 [[LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] -; GCN-COUNT-32: v_accvgpr_read_b32 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) { entry: @@ -322,47 +414,53 @@ ; GCN-LABEL: {{^}}test_mfma_loop_mixed_init: -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0 -; GCN-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 
a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} -; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0 +; GFX908_A-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: 
v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} + +; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 +; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] ; GCN: [[LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] -; GCN-COUNT-32: v_accvgpr_read_b32 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) { entry: @@ -388,17 +486,24 @@ ; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init: -; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX90A-NOT: v_accvgpr +; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} +; GFX90A-NOT: v_accvgpr +; GCN-NOT: v_accvgpr ; GCN: [[LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] + +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] -; GCN-COUNT-32: v_accvgpr_read_b32 define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) { entry: %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, 
i32 0, i32 0) @@ -420,21 +525,30 @@ ; GCN-LABEL: {{^}}test_mfma_loop_agpr_init: -; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 -; GCN: v_mfma_f32_32x32x1f32 +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] +; GFX90A-NOT: v_accvgpr +; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} +; GFX90A-NOT: v_accvgpr ; Check that we are using only one tmp VGPR. -; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}} -; GCN-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}} +; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}} +; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}} +; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}} +; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] ; GCN: [[LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[LOOP]] + +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] -; GCN-COUNT-32: v_accvgpr_read_b32 define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) { entry: %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) @@ -489,23 +603,28 @@ ; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit: -; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} +; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 +; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] ; Check that we do not copy agprs to vgprs and back in an outer loop. 
; GCN: [[OUTER_LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr +; GCN-NOT: v_accvgpr ; GCN: [[INNER_LOOP:BB[0-9_]+]]: -; GCN-NOT: v_accvgpr -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[INNER_LOOP]] -; GCN-NOT: v_accvgpr -; GCN: s_cbranch_scc1 [[OUTER_LOOP]] +; GCN-NOT: v_accvgpr +; GFX908_A: v_mfma_f32_32x32x1f32 +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[INNER_LOOP]] +; GCN-NOT: v_accvgpr +; GCN: s_cbranch_scc1 [[OUTER_LOOP]] ; Final result should be read only once after the loop. -; GCN-COUNT-32: v_accvgpr_read_b32 +; GFX908-COUNT-32: v_accvgpr_read_b32 +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] +; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) { entry: Index: llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -28,7 +28,7 @@ # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec +# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-LABEL: bb.2: @@ -52,7 +52,7 @@ # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -77,7 +77,7 @@ %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -101,7 +101,7 @@ # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec +# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-LABEL: bb.2: @@ -125,7 +125,7 @@ # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -150,7 +150,7 @@ %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -174,7 +174,7 @@ # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec +# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-LABEL: bb.2: @@ -198,7 +198,7 @@ # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -223,7 +223,7 @@ %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -239,7 +239,7 @@ # ADDR64: %9:vgpr_32, %12:sreg_64_xexec = V_ADD_CO_U32_e64 %14.sub0, %4.sub0, 0, implicit $exec # ADDR64: %10:vgpr_32, dead %13:sreg_64_xexec = V_ADDC_U32_e64 %14.sub1, %4.sub1, killed %12, 0, implicit $exec # ADDR64: %11:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1 -# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec --- name: addr64 liveins: @@ -259,7 +259,7 @@ %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec 
$sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -284,7 +284,7 @@ # W64-NO-ADDR64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64-NO-ADDR64: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec +# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64-NO-ADDR64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-NO-ADDR64-LABEL: bb.2: @@ -306,7 +306,7 @@ # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[SRSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[SRSRC0]], %subreg.sub0, [[SRSRC1]], %subreg.sub1, [[SRSRC2]], %subreg.sub2, [[SRSRC3]], %subreg.sub3 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -320,7 +320,7 @@ # ADDR64: [[RSRCFMTHI:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 # ADDR64: [[ZERORSRC:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[ZERO64]], %subreg.sub0_sub1, [[RSRCFMTLO]], %subreg.sub2, [[RSRCFMTHI]], %subreg.sub3 # ADDR64: [[VADDR64:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[RSRCPTR]].sub0, 
%subreg.sub0, [[RSRCPTR]].sub1, %subreg.sub1 -# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec +# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec --- name: offset @@ -341,7 +341,7 @@ %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 Index: llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir +++ llvm/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir @@ -9,7 +9,7 @@ body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: name: no_hazard_image_sample_d_buf_off1 @@ -20,7 +20,7 @@ body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: no_hazard_image_sample_d_buf_far @@ -33,7 +33,7 @@ bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) V_NOP_e32 implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, 0, implicit $exec ... # Non-NSA @@ -45,7 +45,7 @@ body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2_gfx10 undef $vgpr1_vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, 0, implicit $exec ... 
# Less than 4 dwords @@ -57,5 +57,5 @@ body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V3_nsa_gfx10 undef $vgpr1, undef $vgpr2, undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, 0, implicit $exec ... Index: llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir +++ llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -137,7 +137,7 @@ %28 = REG_SEQUENCE %6, 17, killed %27, 18 %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 - BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.bb2: SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -243,7 +243,7 @@ %37 = REG_SEQUENCE %6, 17, killed %36, 18 %38 = V_MOV_B32_e32 0, implicit $exec %39 = COPY %33 - BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.bb2: SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -332,7 +332,7 @@ %28 = REG_SEQUENCE %6, 17, killed %27, 18 %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 - BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.bb2: SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec 
Index: llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir +++ llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir @@ -151,7 +151,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -159,7 +159,7 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -188,7 +188,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -196,7 +196,7 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -225,7 +225,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -233,14 +233,14 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- # CHECK-LABEL: name: optimize_if_and_saveexec_xor_valu_middle # CHECK: $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc -# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # CHECK-NEXT: $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc # CHECK-NEXT: $exec = COPY killed $sgpr2_sgpr3 # CHECK-NEXT: SI_MASK_BRANCH @@ -255,7 +255,7 @@ $vcc = V_CMP_EQ_I32_e64 0, killed $vgpr0, implicit $exec $vgpr0 = V_MOV_B32_e32 4, implicit $exec $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc - BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc $exec = S_MOV_B64_term killed $sgpr2_sgpr3 SI_MASK_BRANCH %bb.2, implicit $exec @@ -266,7 +266,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 
0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -274,7 +274,7 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -304,7 +304,7 @@ bb.1.if: liveins: $sgpr0_sgpr1 , $sgpr4_sgpr5_sgpr6_sgpr7 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr4_sgpr5_sgpr6_sgpr7 @@ -312,7 +312,7 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -346,7 +346,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -356,7 +356,7 @@ $sgpr1 = S_MOV_B32 1 $sgpr2 = S_MOV_B32 -1 $sgpr3 = S_MOV_B32 61440 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -387,7 +387,7 @@ S_SLEEP 0, implicit $sgpr2_sgpr3 $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -395,7 +395,7 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -426,7 +426,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -434,7 +434,7 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -463,7 +463,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -471,7 +471,7 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -500,7 +500,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -508,7 +508,7 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -539,7 +539,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -547,6 +547,6 @@ $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/packed-fp32.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -0,0 +1,580 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s + +; GCN-LABEL: {{^}}fadd_v2_vv: +; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +define amdgpu_kernel void @fadd_v2_vv(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %add = fadd <2 x float> %load, %load + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_vs: +; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fadd_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %add = fadd <2 x float> %load, %x + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v4_vs: +; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fadd_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 
%id + %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16 + %add = fadd <4 x float> %load, %x + store <4 x float> %add, <4 x float> addrspace(1)* %gep, align 16 + ret void +} + +; GCN-LABEL: {{^}}fadd_v32_vs: +; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fadd_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id + %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128 + %add = fadd <32 x float> %load, %x + store <32 x float> %add, <32 x float> addrspace(1)* %gep, align 128 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_imm: +; GCN: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 +; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} +define amdgpu_kernel void @fadd_v2_v_imm(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %add = fadd <2 x float> %load, <float 100.0, float 100.0> + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_v_splat: +; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0 +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}} +define amdgpu_kernel void @fadd_v2_v_v_splat(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fid = bitcast i32 %id to float + %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
+ %k = insertelement <2 x float> %tmp1, float %fid, i64 1 + %add = fadd <2 x float> %load, %k + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_lit_splat: +; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}} +define amdgpu_kernel void @fadd_v2_v_lit_splat(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %add = fadd <2 x float> %load, <float 1.0, float 1.0> + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0: +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 0 +; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 1.0 +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[LO]]:[[HI]]]{{$}} +define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %add = fadd <2 x float> %load, <float 1.0, float 0.0> + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_lit_lo0: +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 0 +; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0 +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[LO]]:[[HI]]]{{$}} +define amdgpu_kernel void @fadd_v2_v_lit_lo0(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float>
addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %add = fadd <2 x float> %load, <float 0.0, float 1.0> + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_unfoldable_lit: +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} +; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 1.0 +; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 2.0 +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %add = fadd <2 x float> %load, <float 1.0, float 2.0> + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_fneg: +; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} +define amdgpu_kernel void @fadd_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fneg = fsub float -0.0, %x + %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0 + %k = insertelement <2 x float> %tmp1, float %fneg, i64 1 + %add = fadd <2 x float> %load, %k + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo: +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}}
+define amdgpu_kernel void @fadd_v2_v_fneg_lo(<2 x float> addrspace(1)* %a, float %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fneg = fsub float -0.0, %x + %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0 + %k = insertelement <2 x float> %tmp1, float %x, i64 1 + %add = fadd <2 x float> %load, %k + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi: +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}} +define amdgpu_kernel void @fadd_v2_v_fneg_hi(<2 x float> addrspace(1)* %a, float %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fneg = fsub float -0.0, %x + %tmp1 = insertelement <2 x float> undef, float %x, i64 0 + %k = insertelement <2 x float> %tmp1, float %fneg, i64 1 + %add = fadd <2 x float> %load, %k + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo2: +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}} +define amdgpu_kernel void @fadd_v2_v_fneg_lo2(<2 x float> addrspace(1)* %a, float %x, float %y) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fneg = fsub float -0.0, %x + %tmp1 = 
insertelement <2 x float> undef, float %fneg, i64 0 + %k = insertelement <2 x float> %tmp1, float %y, i64 1 + %add = fadd <2 x float> %load, %k + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi2: +; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}} +define amdgpu_kernel void @fadd_v2_v_fneg_hi2(<2 x float> addrspace(1)* %a, float %x, float %y) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fneg = fsub float -0.0, %x + %tmp1 = insertelement <2 x float> undef, float %y, i64 0 + %k = insertelement <2 x float> %tmp1, float %fneg, i64 1 + %add = fadd <2 x float> %load, %k + store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fmul_v2_vv: +; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +define amdgpu_kernel void @fmul_v2_vv(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %mul = fmul <2 x float> %load, %load + store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fmul_v2_vs: +; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fmul_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr 
inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %mul = fmul <2 x float> %load, %x + store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fmul_v4_vs: +; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fmul_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id + %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16 + %mul = fmul <4 x float> %load, %x + store <4 x float> %mul, <4 x float> addrspace(1)* %gep, align 16 + ret void +} + +; GCN-LABEL: {{^}}fmul_v32_vs: +; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fmul_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id + %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128 + %mul = fmul <32 x float> %load, %x + store <32 x float> %mul, <32 x float> addrspace(1)* %gep, align 128 + ret void +} + +; GCN-LABEL: {{^}}fmul_v2_v_imm: +; GCN: s_mov_b32 s[[K:[0-9]+]], 0x42c80000 +; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s[[K]], v{{[0-9]+}} +; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}} +define amdgpu_kernel void @fmul_v2_v_imm(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %mul = fmul <2 x float> 
%load, <float 100.0, float 100.0> + store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fmul_v2_v_v_splat: +; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0 +; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}} +define amdgpu_kernel void @fmul_v2_v_v_splat(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fid = bitcast i32 %id to float + %tmp1 = insertelement <2 x float> undef, float %fid, i64 0 + %k = insertelement <2 x float> %tmp1, float %fid, i64 1 + %mul = fmul <2 x float> %load, %k + store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fmul_v2_v_lit_splat: +; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}} +define amdgpu_kernel void @fmul_v2_v_lit_splat(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %mul = fmul <2 x float> %load, <float 4.0, float 4.0> + store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fmul_v2_v_unfoldable_lit: +; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0x40400000, v{{[0-9]+}} +; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 4.0 +; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000 +; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x 
float> addrspace(1)* %gep, align 8 + %mul = fmul <2 x float> %load, <float 4.0, float 3.0> + store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fmul_v2_v_fneg: +; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}} +; GFX90A: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} +define amdgpu_kernel void @fmul_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fneg = fsub float -0.0, %x + %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0 + %k = insertelement <2 x float> %tmp1, float %fneg, i64 1 + %mul = fmul <2 x float> %load, %k + store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fma_v2_vv: +; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +define amdgpu_kernel void @fma_v2_vv(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %load, <2 x float> %load) + store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fma_v2_vs: +; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fma_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, 
i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %x, <2 x float> %x) + store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fma_v4_vs: +; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GFX90A-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fma_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id + %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16 + %fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %load, <4 x float> %x, <4 x float> %x) + store <4 x float> %fma, <4 x float> addrspace(1)* %gep, align 16 + ret void +} + +; GCN-LABEL: {{^}}fma_v32_vs: +; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GFX90A-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fma_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id + %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128 + %fma = tail call <32 x float> @llvm.fma.v32f32(<32 x float> %load, <32 x float> %x, <32 x float> %x) + store <32 x float> %fma, <32 x float> addrspace(1)* %gep, align 128 + ret void +} + +; GCN-LABEL: {{^}}fma_v2_v_imm: +; GCN-DAG: s_mov_b32 s[[K1:[0-9]+]], 0x42c80000 +; GCN-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000 +; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]] +; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s{{\[}}[[K1]]:{{[0-9:]+}}], v{{\[}}[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}} +define amdgpu_kernel 
void @fma_v2_v_imm(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 100.0, float 100.0>, <2 x float> <float 200.0, float 200.0>) + store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fma_v2_v_v_splat: +; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0 +; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}} +define amdgpu_kernel void @fma_v2_v_v_splat(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fid = bitcast i32 %id to float + %tmp1 = insertelement <2 x float> undef, float %fid, i64 0 + %k = insertelement <2 x float> %tmp1, float %fid, i64 1 + %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k) + store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fma_v2_v_lit_splat: +; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0 +; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}} +define amdgpu_kernel void @fma_v2_v_lit_splat(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 4.0>, <2 x float> <float 1.0, float 1.0>) + store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fma_v2_v_unfoldable_lit: +; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000 +; GFX900-DAG: v_fma_f32 
v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0 +; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, 2.0 +; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 4.0 +; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0 +; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0 +; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}} +define amdgpu_kernel void @fma_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 3.0>, <2 x float> <float 1.0, float 2.0>) + store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fma_v2_v_fneg: +; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}} +; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}} +define amdgpu_kernel void @fma_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fneg = fsub float -0.0, %x + %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0 + %k = insertelement <2 x float> %tmp1, float %fneg, i64 1 + %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k) + store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo: +; GFX900-COUNT-2: v_sub_f32_e32 +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] +define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float 
addrspace(3)* %arg2) { +bb: + %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4 + %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4 + %neg.scalar0 = fsub float -0.0, %scalar0 + + %neg.scalar0.vec = insertelement <2 x float> undef, float %neg.scalar0, i32 0 + %neg.scalar0.broadcast = shufflevector <2 x float> %neg.scalar0.vec, <2 x float> undef, <2 x i32> zeroinitializer + + %result = fadd <2 x float> %vec0, %neg.scalar0.broadcast + store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi: +; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; GFX90A: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1] +define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float addrspace(3)* %arg2) { +bb: + %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1 + %arg2.gep = getelementptr inbounds float, float addrspace(3)* %arg2, i32 2 + + %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 4 + + %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4 + %scalar1 = load volatile float, float addrspace(3)* %arg2.gep, align 4 + + %vec.ins0 = insertelement <2 x float> undef, float %scalar0, i32 0 + %vec2 = insertelement <2 x float> %vec.ins0, float %scalar1, i32 1 + %neg.vec2 = fsub <2 x float> <float -0.0, float -0.0>, %vec2 + + %result = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %neg.vec2) + store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}shuffle_add_f32: +; GFX900-COUNT-2: v_add_f32_e32 +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] 
op_sel_hi:[1,0]{{$}} +define amdgpu_kernel void @shuffle_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 { +bb: + %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8 + %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1 + %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8 + %vec1.swap = shufflevector <2 x float> %vec1, <2 x float> undef, <2 x i32> <i32 1, i32 0> + %result = fadd <2 x float> %vec0, %vec1.swap + store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}shuffle_neg_add_f32: +; GFX900-COUNT-2: v_sub_f32_e32 +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} +define amdgpu_kernel void @shuffle_neg_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 { +bb: + %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8 + %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1 + %f32 = load volatile float, float addrspace(3)* undef, align 8 + %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8 + %vec1.neg = fsub <2 x float> <float -0.0, float -0.0>, %vec1 + %vec1.neg.swap = shufflevector <2 x float> %vec1.neg, <2 x float> undef, <2 x i32> <i32 1, i32 0> + %result = fadd <2 x float> %vec0, %vec1.neg.swap + store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_fadd_fsub: +; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0 +; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0] +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0] +define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg) { +bb: + %i12 = fadd <2 x float> zeroinitializer, %arg + %shift8 = shufflevector <2 x float> %i12, <2 x float> undef, <2 x i32> + %i13 = fadd <2 x float> 
zeroinitializer, %shift8 + %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> + %i15 = fsub <2 x float> %i14, zeroinitializer + store <2 x float> %i15, <2 x float>* undef + ret void +} + +; GCN-LABEL: {{^}}fadd_shuffle_v4: +; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] +define amdgpu_kernel void @fadd_shuffle_v4(<4 x float> addrspace(1)* %arg) { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %gep + %shuf = shufflevector <4 x float> %in.1, <4 x float> undef, <4 x i32> zeroinitializer + %add.1 = fadd <4 x float> %in.1, %shuf + store <4 x float> %add.1, <4 x float> addrspace(1)* %gep + ret void +} + +; GCN-LABEL: {{^}}fneg_v2f32_vec: +; GFX900: s_brev_b32 [[SIGN:s[0-9]+]], 1 +; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, [[SIGN]], v{{[0-9]+}} +; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}} +define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id + %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8 + %fneg = fsub <2 x float> <float -0.0, float -0.0>, %load + store <2 x float> %fneg, <2 x float> addrspace(1)* %gep, align 8 + ret void +} + +; GCN-LABEL: {{^}}fneg_v2f32_scalar: +; GCN: s_brev_b32 [[SIGN:s[0-9]+]], 1 +; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGN]] +define amdgpu_kernel void @fneg_v2f32_scalar(<2 x float> addrspace(1)* %a, <2 x float> %x) { + %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x + store <2 x float> %fneg, <2 x float> addrspace(1)* %a, align 8 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) +declare <4 x float> 
@llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>) Index: llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir +++ llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir @@ -59,17 +59,17 @@ ; MUBUF-V2A: liveins: $agpr0 ; MUBUF-V2A: $vgpr0_vgpr1 = IMPLICIT_DEF ; MUBUF-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1 - ; MUBUF-V2A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-V2A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v2_partial_agpr ; FLATSCR-V2A: liveins: $agpr0 ; FLATSCR-V2A: $vgpr0_vgpr1 = IMPLICIT_DEF ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 - ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed 
$vgpr1, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1 - ; FLATSCR-V2A: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5) + ; FLATSCR-V2A: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5) ; FLATSCR-V2A: S_ENDPGM 0 $vgpr0_vgpr1 = IMPLICIT_DEF SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) @@ -93,19 +93,19 @@ ; MUBUF-V2A: liveins: $agpr0 ; MUBUF-V2A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF ; MUBUF-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5) ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 - ; MUBUF-V2A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 
$sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5) - ; MUBUF-V2A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-V2A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-V2A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5) ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v3_partial_agpr ; FLATSCR-V2A: liveins: $agpr0 ; FLATSCR-V2A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 - ; FLATSCR-V2A: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr1_vgpr2, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 8 into %stack.0 + 4, align 4, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr1_vgpr2, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 8 into %stack.0 + 4, align 4, addrspace 5) ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 - ; FLATSCR-V2A: $vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 8 from %stack.0 + 4, align 4, addrspace 5) + ; FLATSCR-V2A: $vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 8 from %stack.0 + 4, align 4, addrspace 5) ; FLATSCR-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF 
SI_SPILL_V96_SAVE killed $vgpr0_vgpr1_vgpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5) @@ -131,11 +131,11 @@ ; MUBUF-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; MUBUF-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; MUBUF-V2A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-V2A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5) ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v4_partial_agpr ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2 @@ -143,11 +143,11 @@ ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A: $agpr1 
= V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; FLATSCR-V2A: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5) + ; FLATSCR-V2A: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5) ; FLATSCR-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5) @@ -173,13 +173,13 @@ ; MUBUF-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; MUBUF-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed 
$vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5) ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 - ; MUBUF-V2A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5) - ; MUBUF-V2A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-V2A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-V2A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) ; 
MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v5_partial_agpr ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2 @@ -187,13 +187,13 @@ ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 - ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5) - ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5) ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 - ; FLATSCR-V2A: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5) - ; FLATSCR-V2A: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit 
$exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) + ; FLATSCR-V2A: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5) + ; FLATSCR-V2A: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) ; FLATSCR-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF SI_SPILL_V160_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5) @@ -221,13 +221,13 @@ ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5) ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, 
implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 - ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5) ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v6_partial_agpr ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 @@ -237,13 +237,13 @@ ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 - ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORD_SADDR killed $vgpr5, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5) ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit 
$exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 - ; FLATSCR-V2A: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5) + ; FLATSCR-V2A: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5) ; FLATSCR-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF SI_SPILL_V192_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5) @@ -270,18 +270,18 @@ ; MUBUF-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 28, addrspace 5) ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; MUBUF-V2A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5) - ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5) - ; MUBUF-V2A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5) - ; MUBUF-V2A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF-V2A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-V2A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF-V2A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5) ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v8_partial_agpr ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 @@ -290,12 +290,12 @@ ; FLATSCR-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; FLATSCR-V2A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-V2A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF SI_SPILL_V256_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5) @@ -323,33 +323,33 @@ ; MUBUF-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 28, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 
0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5) - ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 
0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 
0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5) + ; MUBUF-V2A: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5) ; MUBUF-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5) - ; MUBUF-V2A: $vgpr6 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5) - ; MUBUF-V2A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5) - ; MUBUF-V2A: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5) - ; MUBUF-V2A: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5) - ; MUBUF-V2A: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5) - ; MUBUF-V2A: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5) - ; MUBUF-V2A: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5) - ; MUBUF-V2A: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5) - ; MUBUF-V2A: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5) - ; MUBUF-V2A: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5) + ; MUBUF-V2A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-V2A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF-V2A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF-V2A: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5) + ; MUBUF-V2A: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 
$sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5) + ; MUBUF-V2A: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5) + ; MUBUF-V2A: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5) + ; MUBUF-V2A: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5) + ; MUBUF-V2A: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5) + ; MUBUF-V2A: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5) + ; MUBUF-V2A: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5) ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v16_partial_agpr ; FLATSCR-V2A: 
liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 @@ -359,17 +359,17 @@ ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; FLATSCR-V2A: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr5_vgpr6_vgpr7, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 12 into %stack.0 + 20, align 4, addrspace 5) - ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) - ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr5_vgpr6_vgpr7, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 12 into %stack.0 + 20, align 4, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, 
implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-V2A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; FLATSCR-V2A: $vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 12 from %stack.0 + 20, align 4, addrspace 5) - ; FLATSCR-V2A: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, 
addrspace 5) - ; FLATSCR-V2A: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-V2A: $vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 12 from %stack.0 + 20, align 4, addrspace 5) + ; FLATSCR-V2A: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-V2A: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) ; FLATSCR-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5) Index: llvm/test/CodeGen/AMDGPU/pei-build-spill.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/pei-build-spill.mir +++ llvm/test/CodeGen/AMDGPU/pei-build-spill.mir @@ -3,6 +3,10 @@ # RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-V2A %s # RUN: llc -march=amdgcn 
-mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR %s # RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=MUBUF-GFX90A-V2A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=FLATSCR-GFX90A-V2A %s --- name: test_spill_v1 @@ -17,8 +21,8 @@ bb.0.entry: ; MUBUF-LABEL: name: test_spill_v1 ; MUBUF: $vgpr0 = IMPLICIT_DEF - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_v1 ; MUBUF-V2A: liveins: $agpr0 @@ -28,8 +32,8 @@ ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-LABEL: name: test_spill_v1 ; FLATSCR: $vgpr0 = IMPLICIT_DEF - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, 
implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v1 ; FLATSCR-V2A: liveins: $agpr0 @@ -37,6 +41,28 @@ ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_v1 + ; MUBUF-GFX90A: $vgpr0 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v1 + ; MUBUF-GFX90A-V2A: liveins: $agpr0 + ; MUBUF-GFX90A-V2A: $vgpr0 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_v1 + ; FLATSCR-GFX90A: $vgpr0 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR-GFX90A: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; 
FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v1 + ; FLATSCR-GFX90A-V2A: liveins: $agpr0 + ; FLATSCR-GFX90A-V2A: $vgpr0 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $vgpr0 = IMPLICIT_DEF SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, align 4, addrspace 5) $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) @@ -57,10 +83,10 @@ bb.0.entry: ; MUBUF-LABEL: name: test_spill_v2 ; MUBUF: $vgpr0_vgpr1 = IMPLICIT_DEF - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0, addrspace 5) - ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_v2 ; MUBUF-V2A: liveins: $agpr0, $agpr1 @@ -72,8 +98,8 @@ ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-LABEL: name: test_spill_v2 ; FLATSCR: $vgpr0_vgpr1 = IMPLICIT_DEF - ; FLATSCR: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8 into %stack.0, align 4, addrspace 5) - ; FLATSCR: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8 from %stack.0, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8 into %stack.0, align 4, addrspace 5) + ; FLATSCR: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8 from %stack.0, align 4, addrspace 5) ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v2 ; FLATSCR-V2A: liveins: $agpr0, $agpr1 @@ -83,6 +109,34 @@ ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1 ; FLATSCR-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_v2 + ; MUBUF-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $vgpr0 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v2 + ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1 + ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_v2 + ; FLATSCR-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v2 + ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1 + ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1 
+ ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $vgpr0_vgpr1 = IMPLICIT_DEF SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) $vgpr0_vgpr1 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) @@ -103,12 +157,12 @@ bb.0.entry: ; MUBUF-LABEL: name: test_spill_v3 ; MUBUF: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0, addrspace 5) - ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5) - ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5) ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_v3 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2 @@ -122,8 +176,8 @@ ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-LABEL: name: test_spill_v3 ; FLATSCR: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF - ; FLATSCR: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12 into %stack.0, align 4, addrspace 5) - ; FLATSCR: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12 from %stack.0, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12 into %stack.0, align 4, addrspace 5) + ; FLATSCR: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12 from %stack.0, align 4, addrspace 5) ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v3 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2 @@ -135,6 +189,40 @@ ; FLATSCR-V2A: $vgpr1 = 
V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_v3 + ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v3 + ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2 + ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 + ; 
MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_v3 + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v3 + ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2 + ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2 + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF SI_SPILL_V96_SAVE killed $vgpr0_vgpr1_vgpr2, %stack.0, $sgpr32, 0, 
implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5) $vgpr0_vgpr1_vgpr2 = SI_SPILL_V96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 12 from %stack.0, align 4, addrspace 5) @@ -155,14 +243,14 @@ bb.0.entry: ; MUBUF-LABEL: name: test_spill_v4 ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0, addrspace 5) - ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 4, addrspace 5) - ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 8, addrspace 5) - ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5) ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_v4 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 @@ -178,8 +266,8 @@ ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-LABEL: name: test_spill_v4 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16 into %stack.0, align 
4, addrspace 5) - ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %stack.0, align 4, addrspace 5) ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v4 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 @@ -193,6 +281,46 @@ ; FLATSCR-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_v4 + ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 
0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v4 + ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 + ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: 
test_spill_v4 + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v4 + ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 + ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, 
addrspace 5) @@ -213,16 +341,16 @@ bb.0.entry: ; MUBUF-LABEL: name: test_spill_v5 ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 8, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0, addrspace 5) - ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 4, addrspace 5) - ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 8, addrspace 5) - ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 
12, addrspace 5) - ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 
12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_v5 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 @@ -240,10 +368,10 @@ ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-LABEL: name: test_spill_v5 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 16 into %stack.0, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5) - ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 16 from %stack.0, align 4, addrspace 5) - ; FLATSCR: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5) + ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = 
SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v5 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 @@ -259,6 +387,54 @@ ; FLATSCR-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_v5 + ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 
16, addrspace 5) + ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v5 + ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 + ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: $vgpr0 = 
V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_v5 + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store 4 into %stack.0 + 16, addrspace 5) + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load 4 from %stack.0 + 16, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v5 + ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 + ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, 
implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF SI_SPILL_V160_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = SI_SPILL_V160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5) @@ -279,18 +455,18 @@ bb.0.entry: ; MUBUF-LABEL: name: test_spill_v6 ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 8, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 12, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 16, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0, addrspace 5) - ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 4, addrspace 5) - ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 8, addrspace 5) - ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 12, addrspace 5) - ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 16, addrspace 5) - ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5) ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_v6 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 @@ -310,10 +486,10 @@ ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-LABEL: name: test_spill_v6 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 16 into %stack.0, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr4_vgpr5, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 8 into %stack.0 + 16, align 4, addrspace 5) - ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 16 from %stack.0, align 4, addrspace 5) - ; FLATSCR: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, 
0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 8 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr4_vgpr5, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 8 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 8 from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v6 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 @@ -331,6 +507,60 @@ ; FLATSCR-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_v6 + ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 
4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, 
implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v6 + ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 + ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, 
implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_v6 + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr4_vgpr5, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store 8 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load 8 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v6 + ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 + ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF SI_SPILL_V192_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5) @@ -351,22 +581,22 @@ bb.0.entry: ; MUBUF-LABEL: name: test_spill_v8 ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: 
(store 4 into %stack.0, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 8, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 12, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 28, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0, addrspace 5) - ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 4, addrspace 5) - ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 8, addrspace 5) - ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 12, addrspace 5) - ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5) - ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5) - ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5) - ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: 
$vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5) ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_v8 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 @@ -390,10 +620,10 @@ ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-LABEL: name: test_spill_v8 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) - ; FLATSCR: 
$vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0, align 4, addrspace 5) - ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v8 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 @@ -415,6 +645,72 @@ ; FLATSCR-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: 
test_spill_v8 + ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: 
(store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v8 + ; MUBUF-GFX90A-V2A: liveins: $agpr0, 
$agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 + ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit 
$exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_v8 + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v8 + ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 + ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF + ; 
FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 
$agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF SI_SPILL_V256_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5) @@ -435,38 +731,38 @@ bb.0.entry: ; MUBUF-LABEL: name: test_spill_v16 ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 8, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 12, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 16, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 28, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 
0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0, addrspace 5) - ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 4, addrspace 5) - ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 8, addrspace 5) - ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 12, addrspace 5) - ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 16, addrspace 5) - ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5) - ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5) - ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5) - ; MUBUF: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5) - ; MUBUF: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5) - ; MUBUF: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5) - ; MUBUF: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5) - ; MUBUF: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5) - ; MUBUF: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5) - ; MUBUF: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5) - ; MUBUF: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 
0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5) + ; MUBUF: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5) + ; MUBUF: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5) + ; MUBUF: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5) + ; MUBUF: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5) + ; MUBUF: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5) + ; MUBUF: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5) + ; MUBUF: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5) ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_v16 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 @@ -506,14 +802,14 @@ ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-LABEL: name: test_spill_v16 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) - ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0, align 4, addrspace 5) - ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) - ; FLATSCR: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5) - ; FLATSCR: $vgpr12_vgpr13_vgpr14_vgpr15 = 
SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit 
$flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v16 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 @@ -551,6 +847,124 @@ ; FLATSCR-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_v16 + ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0, 
addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 
into %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 32, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 36, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 40, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 44, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 48, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 52, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 56, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 4 into %stack.0 + 60, addrspace 5) + ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 32, addrspace 5) + ; MUBUF-GFX90A: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 36, addrspace 5) + ; MUBUF-GFX90A: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 40, addrspace 5) + ; MUBUF-GFX90A: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 44, addrspace 5) + ; MUBUF-GFX90A: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 48, addrspace 5) + ; MUBUF-GFX90A: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 52, addrspace 5) + ; MUBUF-GFX90A: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 56, addrspace 5) + ; MUBUF-GFX90A: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 4 from %stack.0 + 60, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v16 + ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 + ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_v16 + ; FLATSCR-GFX90A: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v16 + ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, 
implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.0, $sgpr32, 0, implicit 
$exec :: (store 64 into %stack.0, align 4, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5) @@ -571,70 +985,70 @@ bb.0.entry: ; MUBUF-LABEL: name: test_spill_v32 ; MUBUF: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 8, addrspace 5) - ; MUBUF: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 12, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 16, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 20, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 24, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 28, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 32, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 36, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 40, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 44, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 48, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 
0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 52, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 56, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 60, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 64, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr17, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 68, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 72, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr19, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 76, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 80, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr21, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 84, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr22, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 88, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr23, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 92, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr24, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 96, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr25, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 100, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr26, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 104, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr27, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 108, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr28, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 112, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr29, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 116, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr30, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 120, addrspace 5) - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 124, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0, addrspace 5) - ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 4, addrspace 5) - ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 8, addrspace 5) - ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 12, addrspace 5) - ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 16, addrspace 5) - ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 20, addrspace 5) - ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 24, addrspace 5) - ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 28, addrspace 5) - ; MUBUF: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 32, addrspace 5) - ; MUBUF: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 36, addrspace 5) - ; MUBUF: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 40, addrspace 5) - ; MUBUF: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 44, addrspace 5) - ; MUBUF: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 48, addrspace 5) - ; MUBUF: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 52, addrspace 5) - ; MUBUF: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 56, addrspace 5) - ; MUBUF: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 60, addrspace 5) - ; MUBUF: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 64, addrspace 5) - ; MUBUF: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 68, addrspace 5) - ; MUBUF: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 72, addrspace 5) - ; MUBUF: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 76, addrspace 5) - ; MUBUF: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 80, addrspace 5) - ; MUBUF: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 84, addrspace 5) - ; MUBUF: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 88, addrspace 5) - ; MUBUF: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 92, addrspace 5) - ; MUBUF: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 96, addrspace 5) - ; MUBUF: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 100, addrspace 5) - ; MUBUF: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 104, addrspace 5) - ; MUBUF: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 108, addrspace 5) - ; MUBUF: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 112, addrspace 5) - ; MUBUF: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 116, addrspace 5) - ; MUBUF: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 120, addrspace 5) - ; MUBUF: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 124, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 32, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 36, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 40, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 44, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 48, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 52, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 56, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 60, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 64, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr17, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 68, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 72, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr19, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 76, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 80, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr21, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 84, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr22, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 88, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr23, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 92, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr24, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 96, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr25, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 100, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr26, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 104, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr27, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 108, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr28, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 112, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr29, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 116, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr30, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 120, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 124, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 32, addrspace 5) + ; MUBUF: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 36, addrspace 5) + ; MUBUF: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 40, addrspace 5) + ; MUBUF: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 44, addrspace 5) + ; MUBUF: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 48, addrspace 5) + ; MUBUF: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 52, addrspace 5) + ; MUBUF: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 56, addrspace 5) + ; MUBUF: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 60, addrspace 5) + ; MUBUF: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 64, addrspace 5) + ; MUBUF: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 68, addrspace 5) + ; MUBUF: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 72, addrspace 5) + ; MUBUF: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 76, addrspace 5) + ; MUBUF: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 80, addrspace 5) + ; MUBUF: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 84, addrspace 5) + ; MUBUF: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 88, addrspace 5) + ; MUBUF: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 92, addrspace 5) + ; MUBUF: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 96, addrspace 5) + ; MUBUF: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 100, addrspace 5) + ; MUBUF: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 104, addrspace 5) + ; MUBUF: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 108, addrspace 5) + ; MUBUF: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 112, addrspace 5) + ; MUBUF: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 116, addrspace 5) + ; MUBUF: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 120, addrspace 5) + ; MUBUF: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 124, addrspace 5) ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_v32 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 @@ -706,22 +1120,22 @@ ; MUBUF-V2A: S_ENDPGM 0 ; FLATSCR-LABEL: name: test_spill_v32 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) - ; 
FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr16_vgpr17_vgpr18_vgpr19, $sgpr32, 64, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 64, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr20_vgpr21_vgpr22_vgpr23, $sgpr32, 80, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 80, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr24_vgpr25_vgpr26_vgpr27, $sgpr32, 96, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 96, align 4, addrspace 5) - ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr28_vgpr29_vgpr30_vgpr31, $sgpr32, 112, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 112, align 4, addrspace 5) - ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0, align 4, addrspace 5) - ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) - ; FLATSCR: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 32, align 4, addrspace 5) - ; FLATSCR: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 
0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) - ; FLATSCR: $vgpr16_vgpr17_vgpr18_vgpr19 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 64, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 64, align 4, addrspace 5) - ; FLATSCR: $vgpr20_vgpr21_vgpr22_vgpr23 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 80, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 80, align 4, addrspace 5) - ; FLATSCR: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 96, align 4, addrspace 5) - ; FLATSCR: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 112, align 4, addrspace 5) + ; FLATSCR: 
SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr16_vgpr17_vgpr18_vgpr19, $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit 
$flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 64, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr20_vgpr21_vgpr22_vgpr23, $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 80, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr24_vgpr25_vgpr26_vgpr27, $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 96, align 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr28_vgpr29_vgpr30_vgpr31, $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 112, align 4, addrspace 5) + ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR: $vgpr4_vgpr5_vgpr6_vgpr7 = 
SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR: $vgpr16_vgpr17_vgpr18_vgpr19 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 64, align 4, addrspace 5) + ; FLATSCR: $vgpr20_vgpr21_vgpr22_vgpr23 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 
80, align 4, addrspace 5) + ; FLATSCR: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 96, align 4, addrspace 5) + ; FLATSCR: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 112, align 4, addrspace 5) ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_v32 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 @@ -791,6 +1205,228 @@ ; FLATSCR-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; FLATSCR-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_v32 + ; MUBUF-GFX90A: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, 
implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 32, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 36, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 40, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 44, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 48, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, 
implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 52, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 56, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 60, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 64, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr17, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 68, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr18, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 
0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 72, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr19, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 76, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 80, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr21, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 84, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr22, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 88, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr23, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 
92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 92, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr24, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 96, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr25, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 100, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr26, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 104, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr27, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 108, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr28, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 112, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr29, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 116, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr30, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 120, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $vgpr31, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 4 into %stack.0 + 124, addrspace 5) + ; MUBUF-GFX90A: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $vgpr1 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: $vgpr6 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 32, addrspace 5) + ; MUBUF-GFX90A: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 36, addrspace 5) + ; MUBUF-GFX90A: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 40, addrspace 5) + ; MUBUF-GFX90A: $vgpr11 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 44, addrspace 5) + ; MUBUF-GFX90A: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 48, addrspace 5) + ; MUBUF-GFX90A: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 52, addrspace 5) + ; MUBUF-GFX90A: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 56, addrspace 5) + ; MUBUF-GFX90A: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 60, addrspace 5) + ; MUBUF-GFX90A: $vgpr16 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 64, addrspace 5) + ; MUBUF-GFX90A: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 68, addrspace 5) + ; MUBUF-GFX90A: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 72, addrspace 5) + ; MUBUF-GFX90A: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 76, addrspace 5) + ; MUBUF-GFX90A: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 80, addrspace 5) + ; MUBUF-GFX90A: $vgpr21 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 84, addrspace 5) + ; MUBUF-GFX90A: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 88, addrspace 5) + ; MUBUF-GFX90A: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 92, addrspace 5) + ; MUBUF-GFX90A: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 96, addrspace 5) + ; MUBUF-GFX90A: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 100, addrspace 5) + ; MUBUF-GFX90A: $vgpr26 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 104, addrspace 5) + ; MUBUF-GFX90A: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 108, addrspace 5) + ; MUBUF-GFX90A: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 112, addrspace 5) + ; MUBUF-GFX90A: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 116, addrspace 5) + ; MUBUF-GFX90A: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 120, addrspace 5) + ; MUBUF-GFX90A: $vgpr31 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 4 from %stack.0 + 124, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v32 + ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 + ; MUBUF-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr20, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr21, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr22, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr23, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr24, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr25, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr26, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr27, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr28, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr29, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr30, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr31, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr16 = V_ACCVGPR_READ_B32_e64 $agpr16, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr17 = V_ACCVGPR_READ_B32_e64 $agpr17, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr18 = V_ACCVGPR_READ_B32_e64 $agpr18, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr19 = V_ACCVGPR_READ_B32_e64 $agpr19, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr20 = V_ACCVGPR_READ_B32_e64 $agpr20, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr21 = V_ACCVGPR_READ_B32_e64 $agpr21, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr22 = V_ACCVGPR_READ_B32_e64 $agpr22, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr23 = V_ACCVGPR_READ_B32_e64 $agpr23, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr24 = V_ACCVGPR_READ_B32_e64 $agpr24, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr25 = V_ACCVGPR_READ_B32_e64 $agpr25, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr26 = V_ACCVGPR_READ_B32_e64 $agpr26, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr27 = V_ACCVGPR_READ_B32_e64 $agpr27, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr28 = V_ACCVGPR_READ_B32_e64 $agpr28, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr29 = V_ACCVGPR_READ_B32_e64 $agpr29, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_v32 + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed 
$vgpr8_vgpr9_vgpr10_vgpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr12_vgpr13_vgpr14_vgpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr16_vgpr17_vgpr18_vgpr19, $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 64, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr20_vgpr21_vgpr22_vgpr23, $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 80, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr24_vgpr25_vgpr26_vgpr27, $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 96, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr28_vgpr29_vgpr30_vgpr31, $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store 16 into %stack.0 + 112, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr12_vgpr13_vgpr14_vgpr15 
= SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr16_vgpr17_vgpr18_vgpr19 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 64, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr20_vgpr21_vgpr22_vgpr23 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 80, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 96, align 4, addrspace 5) + ; FLATSCR-GFX90A: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load 16 from %stack.0 + 112, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v32 + ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr8, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr14, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr16, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr17, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr18, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr19, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr20, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr21, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr22, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr23, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr24, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr25, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr26, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr27, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr28, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr29, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr30, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr31, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr16 = V_ACCVGPR_READ_B32_e64 $agpr16, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr17 = V_ACCVGPR_READ_B32_e64 $agpr17, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr18 = V_ACCVGPR_READ_B32_e64 $agpr18, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr19 = V_ACCVGPR_READ_B32_e64 $agpr19, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr20 = V_ACCVGPR_READ_B32_e64 $agpr20, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr21 = V_ACCVGPR_READ_B32_e64 $agpr21, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr22 = V_ACCVGPR_READ_B32_e64 $agpr22, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr23 = V_ACCVGPR_READ_B32_e64 $agpr23, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr24 = V_ACCVGPR_READ_B32_e64 $agpr24, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr25 = V_ACCVGPR_READ_B32_e64 $agpr25, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr26 = V_ACCVGPR_READ_B32_e64 $agpr26, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr27 = V_ACCVGPR_READ_B32_e64 $agpr27, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr28 = V_ACCVGPR_READ_B32_e64 $agpr28, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr29 = V_ACCVGPR_READ_B32_e64 $agpr29, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr30, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF SI_SPILL_V1024_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = SI_SPILL_V1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5) @@ -812,8 +1448,8 @@ ; MUBUF-LABEL: name: test_spill_a1 ; MUBUF: $agpr0 = IMPLICIT_DEF ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 
$sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_a1 @@ -825,8 +1461,8 @@ ; FLATSCR-LABEL: name: test_spill_a1 ; FLATSCR: $agpr0 = IMPLICIT_DEF ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_a1 @@ -835,6 +1471,28 @@ ; FLATSCR-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_a1 + ; MUBUF-GFX90A: $agpr0 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a1 + ; MUBUF-GFX90A-V2A: liveins: $vgpr0 + ; 
MUBUF-GFX90A-V2A: $agpr0 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_a1 + ; FLATSCR-GFX90A: $agpr0 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORD_SADDR killed $agpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR-GFX90A: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a1 + ; FLATSCR-GFX90A-V2A: liveins: $vgpr0 + ; FLATSCR-GFX90A-V2A: $agpr0 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $agpr0 = IMPLICIT_DEF SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, align 4, addrspace 5) $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, align 4, addrspace 5) @@ -856,12 +1514,12 @@ ; MUBUF-LABEL: name: test_spill_a2 ; MUBUF: $agpr0_agpr1 = IMPLICIT_DEF ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 
implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_a2 @@ -875,12 +1533,12 @@ ; FLATSCR-LABEL: name: test_spill_a2 ; FLATSCR: $agpr0_agpr1 = IMPLICIT_DEF ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) - ; FLATSCR: $vgpr0 = 
SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_a2 @@ -891,6 +1549,34 @@ ; FLATSCR-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 ; FLATSCR-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_a2 + ; MUBUF-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $agpr1 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a2 + ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1 + ; MUBUF-GFX90A-V2A: $agpr0_agpr1 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1 + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_a2 + ; FLATSCR-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 8 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 8 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a2 + ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1 + ; FLATSCR-GFX90A-V2A: $agpr0_agpr1 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1 + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $agpr0_agpr1 = IMPLICIT_DEF SI_SPILL_A64_SAVE 
killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) @@ -912,16 +1598,16 @@ ; MUBUF-LABEL: name: test_spill_a3 ; MUBUF: $agpr0_agpr1_agpr2 = IMPLICIT_DEF ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 
0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_a3 @@ -937,16 +1623,16 @@ ; FLATSCR-LABEL: name: test_spill_a3 ; FLATSCR: $agpr0_agpr1_agpr2 = IMPLICIT_DEF ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit 
$flat_scr, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5) - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_a3 @@ -959,6 +1645,40 @@ ; FLATSCR-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2 ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, 
implicit-def $agpr0_agpr1_agpr2 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_a3 + ; MUBUF-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a3 + ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2 + ; MUBUF-GFX90A-V2A: $agpr0 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_a3 + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 12 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 12 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a3 + ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 + ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2 + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $agpr0_agpr1_agpr2 = IMPLICIT_DEF SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5) $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: 
(load 12 from %stack.0, align 4, addrspace 5) @@ -980,20 +1700,20 @@ ; MUBUF-LABEL: name: test_spill_a4 ; MUBUF: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_a4 @@ -1011,20 +1731,20 @@ ; FLATSCR-LABEL: name: test_spill_a4 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; FLATSCR: $vgpr0 = 
V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5) - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; FLATSCR: 
$vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_a4 @@ -1039,6 +1759,46 @@ ; FLATSCR-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_a4 + ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 
$sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a4 + ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; MUBUF-GFX90A-V2A: 
$vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_a4 + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a4 + ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed 
$agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5) $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5) @@ -1060,24 +1820,24 @@ ; MUBUF-LABEL: name: test_spill_a5 ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, 
addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 
0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_a5 @@ -1097,24 +1857,24 @@ ; FLATSCR-LABEL: name: test_spill_a5 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, 
$sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, 
addrspace 5) - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from 
%stack.0 + 16, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5) ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_a5 @@ -1131,6 +1891,54 @@ ; FLATSCR-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 ; FLATSCR-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_a5 + ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit 
$exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a5 + ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: $agpr1 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_a5 + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORD_SADDR killed $agpr4, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store 4 into %stack.0 + 16, addrspace 5) + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load 4 from %stack.0 + 16, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a5 + ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, 
implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5) $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5) @@ -1152,28 +1960,28 @@ ; MUBUF-LABEL: name: test_spill_a6 ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, 
implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit 
$exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 
12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5) ; MUBUF: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_a6 @@ -1195,28 +2003,28 @@ ; FLATSCR-LABEL: name: test_spill_a6 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed 
$agpr1, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, 
$sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5) - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, 
implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5) ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5) ; FLATSCR: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_a6 @@ -1235,6 +2043,60 @@ ; FLATSCR-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; FLATSCR-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_a6 + ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a6 + ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: 
test_spill_a6 + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX2_SADDR killed $agpr4_agpr5, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store 8 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr4_agpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load 8 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a6 + ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5) $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5) @@ -1256,36 +2118,36 @@ ; MUBUF-LABEL: name: test_spill_a8 ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed 
$vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 
$sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5) ; MUBUF: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5) ; MUBUF: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5) ; MUBUF: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_a8 @@ -1311,36 +2173,36 @@ ; FLATSCR-LABEL: name: test_spill_a8 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed 
$vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5) + ; FLATSCR: 
SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5) - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 
from %stack.0, addrspace 5) ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5) ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit 
$flat_scr :: (load 4 from %stack.0 + 20, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5) ; FLATSCR: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5) ; FLATSCR: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5) ; FLATSCR: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_a8 @@ -1363,6 +2225,72 @@ ; FLATSCR-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; FLATSCR-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_a8 + ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0, 
addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, 
implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a8 + ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $vgpr1 = 
V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: $agpr7 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_a8 + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr4_agpr5_agpr6_agpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a8 + ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $vgpr2 = 
V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; 
FLATSCR-GFX90A-V2A: S_ENDPGM 0 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5) $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5) @@ -1384,68 +2312,68 @@ ; MUBUF-LABEL: name: test_spill_a16 ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5) ; MUBUF: $vgpr0 = 
V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 12, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5) ; MUBUF: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, 
implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5) ; MUBUF: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5) ; MUBUF: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 32, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 32, addrspace 5) ; MUBUF: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 36, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 36, addrspace 5) ; MUBUF: $agpr9 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 40, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 40, addrspace 5) ; MUBUF: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 44, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 44, addrspace 5) ; MUBUF: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 48, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 48, addrspace 5) ; MUBUF: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 52, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 
52, addrspace 5) ; MUBUF: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 56, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 56, addrspace 5) ; MUBUF: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 60, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 60, addrspace 5) ; MUBUF: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_a16 @@ -1487,68 +2415,68 @@ ; FLATSCR-LABEL: name: test_spill_a16 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 
0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 12, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 
into %stack.0 + 12, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed 
$vgpr0, $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 36, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 36, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 40, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 40, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 44, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 44, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 52, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5) + ; 
FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 52, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 56, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 56, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 60, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5) - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 60, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; 
FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5) ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR 
$sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5) ; FLATSCR: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5) ; FLATSCR: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5) ; FLATSCR: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 32, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 32, addrspace 5) ; FLATSCR: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, 0, 0, implicit $exec, 
implicit $flat_scr :: (load 4 from %stack.0 + 36, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 36, addrspace 5) ; FLATSCR: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 40, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 40, addrspace 5) ; FLATSCR: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 44, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 44, addrspace 5) ; FLATSCR: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 48, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 48, addrspace 5) ; FLATSCR: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from 
%stack.0 + 52, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 52, addrspace 5) ; FLATSCR: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 56, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 56, addrspace 5) ; FLATSCR: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 60, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 60, addrspace 5) ; FLATSCR: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_a16 @@ -1587,6 +2515,124 @@ ; FLATSCR-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; FLATSCR-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_a16 + ; MUBUF-GFX90A: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: 
BUFFER_STORE_DWORD_OFFSET killed $agpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 32, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 36, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 40, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 44, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 48, 
addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 52, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 56, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 4 into %stack.0 + 60, addrspace 5) + ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from 
%stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: $agpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 32, addrspace 5) + ; MUBUF-GFX90A: $agpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 36, addrspace 5) + ; MUBUF-GFX90A: $agpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from 
%stack.0 + 40, addrspace 5) + ; MUBUF-GFX90A: $agpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 44, addrspace 5) + ; MUBUF-GFX90A: $agpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 48, addrspace 5) + ; MUBUF-GFX90A: $agpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 52, addrspace 5) + ; MUBUF-GFX90A: $agpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 56, addrspace 5) + ; MUBUF-GFX90A: $agpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 4 from %stack.0 + 60, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a16 + ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; 
MUBUF-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: 
$agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 
$vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_a16 + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr4_agpr5_agpr6_agpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr8_agpr9_agpr10_agpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr12_agpr13_agpr14_agpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 
0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 16 from %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr12_agpr13_agpr14_agpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a16 + ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5) $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5) @@ -1608,132 +2654,132 @@ ; MUBUF-LABEL: name: test_spill_a32 ; MUBUF: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 
killed $agpr3, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5) ; MUBUF: $vgpr0 = 
V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, addrspace 5) + ; MUBUF: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5) ; MUBUF: $vgpr0 = 
V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5) + ; MUBUF: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5) ; MUBUF: $vgpr0 
= V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 120, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 120, addrspace 5) ; MUBUF: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec - ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5) - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 
from %stack.0, addrspace 5) ; MUBUF: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 4, addrspace 5) ; MUBUF: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 8, addrspace 5) ; MUBUF: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 12, addrspace 5) ; MUBUF: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 16, addrspace 5) ; MUBUF: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 20, addrspace 5) ; MUBUF: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 24, addrspace 5) ; MUBUF: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 28, addrspace 5) ; MUBUF: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 32, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 32, addrspace 5) ; MUBUF: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 36, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 36, addrspace 5) ; MUBUF: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 40, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 40, addrspace 5) ; MUBUF: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 44, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 44, addrspace 5) ; MUBUF: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 48, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 48, addrspace 5) ; MUBUF: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 52, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 52, addrspace 5) ; MUBUF: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 56, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 56, addrspace 5) ; MUBUF: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 60, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 60, addrspace 5) ; MUBUF: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 64, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 64, addrspace 5) ; MUBUF: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 68, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 68, addrspace 5) ; MUBUF: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 72, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 72, addrspace 5) ; MUBUF: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 76, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 76, addrspace 5) ; MUBUF: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 80, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 80, addrspace 5) ; MUBUF: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 84, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 84, addrspace 5) ; MUBUF: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 88, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 88, addrspace 5) ; MUBUF: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 92, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 92, addrspace 5) ; MUBUF: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 96, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 96, addrspace 5) ; MUBUF: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 100, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 100, addrspace 5) ; MUBUF: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 104, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 104, addrspace 5) ; MUBUF: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 108, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 108, addrspace 5) ; MUBUF: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 112, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 112, addrspace 5) ; MUBUF: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 116, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 116, addrspace 5) ; MUBUF: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 120, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 120, addrspace 5) ; MUBUF: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 124, addrspace 5) + ; MUBUF: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0 + 124, addrspace 5) ; MUBUF: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; MUBUF: S_ENDPGM 0 ; MUBUF-V2A-LABEL: name: test_spill_a32 @@ -1807,132 +2853,132 @@ ; FLATSCR-LABEL: name: test_spill_a32 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, 
implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit 
$exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 
28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 36, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 36, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5) ; 
FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 40, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 40, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 44, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 44, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 52, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 52, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 56, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, 
$sgpr32, 56, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 60, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 60, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 64, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, 
addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 68, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 68, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 72, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 72, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 76, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 76, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 80, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 84, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, 
$sgpr32, 84, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 88, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 88, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 92, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 92, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, 
addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 96, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 100, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 104, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 104, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 108, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 108, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 112, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed 
$vgpr0, $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 116, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 116, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 120, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 120, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 120, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into 
%stack.0 + 120, addrspace 5) ; FLATSCR: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec - ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 124, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5) - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + ; FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 124, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) ; FLATSCR: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 4, addrspace 5) ; FLATSCR: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 8, addrspace 5) ; FLATSCR: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 12, addrspace 5) ; FLATSCR: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 16, addrspace 5) ; FLATSCR: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 20, addrspace 5) ; FLATSCR: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 24, addrspace 5) ; FLATSCR: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 28, addrspace 5) ; FLATSCR: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 32, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 32, addrspace 5) ; FLATSCR: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 36, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 36, addrspace 5) ; FLATSCR: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 40, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 40, addrspace 5) ; FLATSCR: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 44, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 44, addrspace 5) ; FLATSCR: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 48, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 48, addrspace 5) ; FLATSCR: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 52, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 52, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 52, addrspace 5) ; FLATSCR: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 56, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 56, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 56, addrspace 5) ; FLATSCR: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 60, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 60, addrspace 5) ; FLATSCR: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 64, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 64, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 64, addrspace 5) ; FLATSCR: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 68, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 68, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 68, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 68, addrspace 5) ; FLATSCR: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 72, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 72, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 72, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 72, addrspace 5) ; FLATSCR: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 76, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 76, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 76, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 76, addrspace 5) ; FLATSCR: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 80, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 80, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 80, addrspace 5) ; FLATSCR: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 84, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 84, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 84, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 84, addrspace 5) ; FLATSCR: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 88, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 88, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 88, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 88, addrspace 5) ; FLATSCR: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 92, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 92, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 92, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 92, addrspace 5) ; FLATSCR: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 96, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 96, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 96, addrspace 5) ; FLATSCR: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 100, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 100, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 100, addrspace 5) ; FLATSCR: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 104, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 104, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 104, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 104, addrspace 5) ; FLATSCR: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 108, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 108, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 108, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 108, addrspace 5) ; FLATSCR: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 112, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 112, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 112, addrspace 5) ; FLATSCR: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 116, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 116, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 116, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 116, addrspace 5) ; FLATSCR: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 120, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 120, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 120, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 120, addrspace 5) ; FLATSCR: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 124, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 124, addrspace 5) + ; FLATSCR: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 124, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0 + 124, addrspace 5) ; FLATSCR: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; FLATSCR: S_ENDPGM 0 ; FLATSCR-V2A-LABEL: name: test_spill_a32 @@ -2003,6 +3049,228 @@ ; FLATSCR-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; FLATSCR-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; FLATSCR-V2A: S_ENDPGM 0 + ; MUBUF-GFX90A-LABEL: name: test_spill_a32 + ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, 
implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 32, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr9, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 36, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 40, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit 
$exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 44, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr12, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 48, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr13, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 52, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 56, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 60, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr16, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 
0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 64, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr17, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 68, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr18, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 72, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr19, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 76, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr20, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 80, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr21, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 
0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 84, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr22, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 88, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr23, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 92, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr24, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 96, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr25, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 100, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr26, $sgpr0_sgpr1_sgpr2_sgpr3, 
$sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 104, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr27, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 108, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr28, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 112, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr29, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 116, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr30, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 120, addrspace 5) + ; MUBUF-GFX90A: BUFFER_STORE_DWORD_OFFSET killed $agpr31, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 4 into %stack.0 + 124, addrspace 5) + ; MUBUF-GFX90A: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0, addrspace 5) + ; MUBUF-GFX90A: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 4, addrspace 5) + ; MUBUF-GFX90A: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 8, addrspace 5) + ; MUBUF-GFX90A: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 12, addrspace 5) + ; MUBUF-GFX90A: $agpr4 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 16, addrspace 5) + ; MUBUF-GFX90A: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 20, addrspace 5) + ; MUBUF-GFX90A: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 24, addrspace 5) + ; MUBUF-GFX90A: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 28, addrspace 5) + ; MUBUF-GFX90A: $agpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 32, addrspace 5) + ; MUBUF-GFX90A: $agpr9 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 36, addrspace 5) + ; MUBUF-GFX90A: $agpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 40, addrspace 5) + ; MUBUF-GFX90A: $agpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 44, addrspace 5) + ; MUBUF-GFX90A: $agpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 48, addrspace 5) + ; MUBUF-GFX90A: $agpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 52, addrspace 5) + ; MUBUF-GFX90A: $agpr14 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 56, addrspace 5) + ; MUBUF-GFX90A: $agpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 60, addrspace 5) + ; MUBUF-GFX90A: $agpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 64, addrspace 5) + ; MUBUF-GFX90A: $agpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 68, addrspace 5) + ; MUBUF-GFX90A: $agpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 72, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 72, addrspace 5) + ; MUBUF-GFX90A: $agpr19 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 76, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 76, addrspace 5) + ; MUBUF-GFX90A: $agpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 80, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 80, addrspace 5) + ; MUBUF-GFX90A: $agpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 84, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 84, addrspace 5) + ; MUBUF-GFX90A: $agpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 88, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 88, addrspace 5) + ; MUBUF-GFX90A: $agpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 92, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 92, addrspace 5) + ; MUBUF-GFX90A: $agpr24 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 96, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 96, addrspace 5) + ; MUBUF-GFX90A: $agpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 100, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 100, addrspace 5) + ; MUBUF-GFX90A: $agpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 104, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 104, addrspace 5) + ; MUBUF-GFX90A: $agpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 108, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 108, addrspace 5) + ; MUBUF-GFX90A: $agpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 112, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 112, addrspace 5) + ; MUBUF-GFX90A: $agpr29 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 116, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 116, addrspace 5) + ; MUBUF-GFX90A: $agpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 120, addrspace 5) + ; MUBUF-GFX90A: $agpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 4 from %stack.0 + 124, addrspace 5) + ; MUBUF-GFX90A: S_ENDPGM 0 + ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a32 + ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + ; MUBUF-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF + ; MUBUF-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; 
MUBUF-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $vgpr16, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $vgpr17, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $vgpr18, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $vgpr19, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $vgpr20, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $vgpr21, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $vgpr22, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $vgpr23, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $vgpr24, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $vgpr25, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $vgpr26, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $vgpr27, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $vgpr28, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $vgpr29, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; MUBUF-GFX90A-V2A: S_ENDPGM 0 + ; FLATSCR-GFX90A-LABEL: name: test_spill_a32 + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr4_agpr5_agpr6_agpr7, $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr8_agpr9_agpr10_agpr11, $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr12_agpr13_agpr14_agpr15, $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr16_agpr17_agpr18_agpr19, $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 64, align 4, addrspace 5) + ; FLATSCR-GFX90A: 
SCRATCH_STORE_DWORDX4_SADDR killed $agpr20_agpr21_agpr22_agpr23, $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 80, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr24_agpr25_agpr26_agpr27, $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 96, align 4, addrspace 5) + ; FLATSCR-GFX90A: SCRATCH_STORE_DWORDX4_SADDR killed $agpr28_agpr29_agpr30_agpr31, $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store 16 into %stack.0 + 112, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 16, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 32, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr12_agpr13_agpr14_agpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 48, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr16_agpr17_agpr18_agpr19 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 64, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr20_agpr21_agpr22_agpr23 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 80, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 80, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr24_agpr25_agpr26_agpr27 
= SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 96, align 4, addrspace 5) + ; FLATSCR-GFX90A: $agpr28_agpr29_agpr30_agpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, 0, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load 16 from %stack.0 + 112, align 4, addrspace 5) + ; FLATSCR-GFX90A: S_ENDPGM 0 + ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a32 + ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + ; FLATSCR-GFX90A-V2A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF + ; FLATSCR-GFX90A-V2A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; 
FLATSCR-GFX90A-V2A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit killed 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $vgpr16, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $vgpr17, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $vgpr18, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $vgpr19, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $vgpr20, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $vgpr21, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $vgpr22, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $vgpr23, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $vgpr24, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $vgpr25, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $vgpr26, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $vgpr27, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $vgpr28, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $vgpr29, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; FLATSCR-GFX90A-V2A: S_ENDPGM 0 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5) $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5) Index: llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir +++ 
llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir @@ -29,12 +29,12 @@ ; CHECK: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr4, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr5 = S_MOV_B32 524288 - ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) + ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_BRANCH %bb.1 ; CHECK: bb.1: ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr4 = S_MOV_B32 524288 - ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) + ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_ENDPGM 0, implicit $vgpr0 bb.0: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) Index: llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -27,7 +27,7 @@ ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 ; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc - ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into 
%stack.3, addrspace 5) ; GFX8: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc ; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX8: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 @@ -35,7 +35,7 @@ ; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc ; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; GFX8: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) + ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) ; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8: $vcc_lo = S_MOV_B32 8192 ; GFX8: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec @@ -44,15 +44,15 @@ ; GFX8: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; GFX8: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX8: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec - ; GFX8: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX8: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) ; GFX8: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5) + ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5) ; GFX8: S_ENDPGM 0, csr_amdgpu_allvgprs ; GFX9-LABEL: name: pei_scavenge_vgpr_spill ; GFX9: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 ; GFX9: 
$sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX9: $sgpr32 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc - ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) ; GFX9: $sgpr32 = S_SUB_U32 $sgpr32, 8196, implicit-def $scc ; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX9: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 @@ -60,7 +60,7 @@ ; GFX9: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def $scc ; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; GFX9: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) ; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec ; GFX9: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec @@ -68,15 +68,15 @@ ; GFX9: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; GFX9: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX9: $vgpr0 = V_MOV_B32_e32 8196, implicit $exec - ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) ; GFX9: $exec = S_MOV_B64 killed $sgpr4_sgpr5 - ; 
GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5) + ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 20, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5) ; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill ; GFX9-FLATSCR: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 ; GFX9-FLATSCR: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX9-FLATSCR: $sgpr4 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc - ; GFX9-FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, killed $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.3, addrspace 5) + ; GFX9-FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.3, addrspace 5) ; GFX9-FLATSCR: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX9-FLATSCR: $vgpr2 = V_WRITELANE_B32 $sgpr33, 0, undef $vgpr2 ; GFX9-FLATSCR: $sgpr33 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc @@ -89,7 +89,7 @@ ; GFX9-FLATSCR: $sgpr33 = V_READLANE_B32 $vgpr2, 0 ; GFX9-FLATSCR: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX9-FLATSCR: $sgpr4 = S_ADD_U32 $sgpr32, 8196, implicit-def $scc - ; GFX9-FLATSCR: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr4, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.3, addrspace 5) + ; GFX9-FLATSCR: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.3, addrspace 5) ; GFX9-FLATSCR: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; GFX9-FLATSCR: S_ENDPGM 0, csr_amdgpu_allvgprs $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec Index: llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir +++ 
llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir @@ -46,7 +46,7 @@ %15:sreg_32_xm0 = S_MOV_B32 61440 %16:sreg_32_xm0 = S_MOV_B32 -1 %17:sgpr_128 = REG_SEQUENCE undef %14:sreg_32_xm0, %subreg.sub0, undef %12:sreg_32_xm0, %subreg.sub1, %16, %subreg.sub2, %15, %subreg.sub3 - BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) %19:vgpr_32 = COPY %4 %20:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 Index: llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -7,33 +7,33 @@ body: | bb.0: ; GCN-LABEL: name: bundle_memops - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GCN: S_NOP 0 ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit undef $vgpr3_vgpr4, implicit $exec { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: S_NOP 0 - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GCN: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, 
implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr5, implicit-def $vgpr5_lo16, implicit-def $vgpr5_hi16, implicit undef $vgpr0_vgpr1, implicit $exec, implicit undef $vgpr3_vgpr4 { - ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, implicit $exec - ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec - ; GCN: $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: BUNDLE implicit undef $vgpr3_vgpr4, implicit $vgpr1, implicit $exec, implicit $vgpr0 { - ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: S_NOP 0 ; GCN: BUNDLE implicit undef $vgpr3_vgpr4, implicit $vgpr1, implicit $exec, implicit $vgpr0 { - ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: S_NOP 0 - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GCN: S_NOP 
0 - ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + ; GCN: GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0, implicit $exec, implicit $vgpr1 { ; GCN: $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec ; GCN: $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec @@ -48,20 +48,20 @@ ; GCN: $sgpr3 = S_LOAD_DWORD_SGPR undef $sgpr0_sgpr1, undef $sgpr10, 0, 0 ; GCN: } ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit $vgpr0, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr2, implicit $exec, implicit $vgpr1 { - ; GCN: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: BUNDLE implicit $vgpr0, implicit $vgpr2_vgpr3, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec { - ; GCN: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit 
$exec ; GCN: } ; GCN: BUNDLE implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit undef $vgpr4_vgpr5_vgpr6_vgpr7, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { - ; GCN: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) - ; GCN: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + ; GCN: $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + ; GCN: $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) ; GCN: } ; GCN: BUNDLE implicit undef $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { - ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) - ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + ; GCN: IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) ; GCN: } ; GCN: S_NOP 0 ; GCN: 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0 @@ -71,25 +71,25 @@ ; GCN: $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec ; GCN: $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec ; GCN: } - $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec S_NOP 0 - $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec S_NOP 0 - $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, implicit $exec - $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, implicit $exec - $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD undef $vgpr0_vgpr1, 4, 0, 0, 0, 0, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 4, 0, 0, 0, 0, implicit $exec + $vgpr5 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec S_NOP 0 - GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec + 
GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr1, 0, 0, 0, 0, 0, implicit $exec S_NOP 0 - $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD undef $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec S_NOP 0 - GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr3_vgpr4, $vgpr0, 4, 0, 0, 0, 0, implicit $exec $vgpr2 = DS_READ_B32_gfx9 $vgpr0, 0, 0, implicit $exec $vgpr3 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $exec DS_WRITE_B32_gfx9 $vgpr0, $vgpr2, 0, 0, implicit killed $m0, implicit $exec @@ -97,14 +97,14 @@ S_NOP 0 $sgpr2 = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0 $sgpr3 = S_LOAD_DWORD_SGPR undef $sgpr0_sgpr1, undef $sgpr10, 0, 0 - $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) - $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) - IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) - IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + $vgpr2 = 
BUFFER_LOAD_DWORD_OFFEN $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr2, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 $vgpr0, $vgpr2_vgpr3, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + $vgpr3 = IMAGE_LOAD_V1_V4 undef $vgpr4_vgpr5_vgpr6_vgpr7, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) + IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + IMAGE_STORE_V4_V2 undef $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr0_vgpr1, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) S_NOP 0 $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 464, 0, 0 $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 = S_LOAD_DWORDX8_IMM undef $sgpr10_sgpr11, 128, 0, 0 @@ -123,13 +123,13 @@ ; GCN-LABEL: name: bundle_dbg_value_0 ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GCN: DBG_VALUE internal $vgpr0, 0, 0 - ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 
implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ; GCN: } - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec DBG_VALUE $vgpr0, 0, 0 - $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ... @@ -143,16 +143,16 @@ ; GCN-LABEL: name: bundle_dbg_value_1 ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1 ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GCN: DBG_VALUE internal $vgpr0, 0, 0 ; GCN: DBG_VALUE $vgpr1, 0, 0 - ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: DBG_VALUE $vgpr2, 0, 0 - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec DBG_VALUE $vgpr0, 0, 0 DBG_VALUE $vgpr1, 0, 0 - $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec DBG_VALUE $vgpr2, 0, 0 ... 
@@ -167,15 +167,15 @@ ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6, $vgpr1 ; GCN: DBG_VALUE $vgpr1, 0, 0 ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GCN: DBG_VALUE internal $vgpr0, 0, 0 - ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: DBG_VALUE $vgpr2, 0, 0 DBG_VALUE $vgpr1, 0, 0 - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec DBG_VALUE $vgpr0, 0, 0 - $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec DBG_VALUE $vgpr2, 0, 0 ... 
@@ -189,14 +189,14 @@ ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr1, implicit $vgpr5_vgpr6 { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GCN: KILL $vgpr1 - ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ; GCN: } $vgpr1 = V_MOV_B32_e32 0, implicit $exec - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec KILL $vgpr1 - $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ... 
@@ -210,14 +210,14 @@ ; GCN: liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 ; GCN: $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec ; GCN: KILL internal $vgpr0 - ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ; GCN: } $vgpr1 = V_MOV_B32_e32 0, implicit $exec - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec KILL $vgpr0 - $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ... @@ -232,11 +232,11 @@ ; GCN-LABEL: name: post_bundle_kill ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ; GCN: } - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec KILL killed $vgpr3_vgpr4, killed $vgpr5_vgpr6 ... 
@@ -249,13 +249,13 @@ ; GCN-LABEL: name: post_bundle_kill_other ; GCN: $vgpr7 = V_MOV_B32_e32 0, implicit $exec ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: KILL killed $vgpr7 $vgpr7 = V_MOV_B32_e32 0, implicit $exec - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec KILL killed $vgpr7 ... 
@@ -269,12 +269,12 @@ ; GCN-LABEL: name: post_bundle_kill_plus_other ; GCN: $vgpr7 = V_MOV_B32_e32 0, implicit $exec ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec ; GCN: } ; GCN: KILL killed $vgpr7, killed $vgpr3 $vgpr7 = V_MOV_B32_e32 0, implicit $exec - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec - $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, 0, implicit $exec KILL killed $vgpr7, killed $vgpr3 ... Index: llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir +++ llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir @@ -16,6 +16,6 @@ S_BARRIER $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X1F32_e64 undef $vgpr0, undef $vgpr0, 0, 0, 0, 2, implicit $mode, implicit $exec $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr31, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... 
Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir +++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx10.mir @@ -34,12 +34,12 @@ %26:vgpr_32, %27:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %25, %21, 0, implicit $exec %28:vgpr_32, dead %29:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 - %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec %32:sgpr_32 = S_MOV_B32 6144 %33:vgpr_32, %34:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec %35:vgpr_32, dead %36:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 - %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec ... 
--- @@ -87,17 +87,17 @@ %26:vgpr_32, %27:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec %28:vgpr_32, dead %29:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 - %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec %32:sgpr_32 = S_MOV_B32 6400 %33:vgpr_32, %34:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec %35:vgpr_32, dead %36:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 - %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec %39:sgpr_32 = S_MOV_B32 11200 %40:vgpr_32, %41:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %39, 0, implicit $exec %42:vgpr_32, dead %43:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %41, 0, implicit $exec %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1 - %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, implicit $exec + %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, 0, implicit $exec ... 
--- @@ -144,17 +144,17 @@ %26:vgpr_32, %27:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec %28:vgpr_32, dead %29:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 - %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec %32:sgpr_32 = S_MOV_B32 8192 %33:vgpr_32, %34:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec %35:vgpr_32, dead %36:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 - %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec %39:sgpr_32 = S_MOV_B32 10240 %40:vgpr_32, %41:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %39, 0, implicit $exec %42:vgpr_32, dead %43:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 0, killed %41, 0, implicit $exec %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1 - %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, implicit $exec + %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, 0, implicit $exec ... --- @@ -190,7 +190,7 @@ %26:vgpr_32, %27:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec %28:vgpr_32, dead %29:sreg_32_xm0_xexec = V_ADDC_U32_e64 %23, 4294967295, killed %27, 0, implicit $exec %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 - %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec ... 
--- @@ -208,11 +208,11 @@ %2:vgpr_32, %3:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec %4:vgpr_32, dead %5:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1 - GLOBAL_STORE_DWORD %6, %0.sub0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %6, %0.sub0, 0, 0, 0, 0, 0, implicit $exec %8:sgpr_32 = S_MOV_B32 3000 %9:vgpr_32, %10:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec %11:vgpr_32, dead %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 - GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, 0, 0, 0, implicit $exec ... Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir +++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir @@ -34,12 +34,12 @@ %26:vgpr_32, %27:sreg_64_xexec = V_ADD_CO_U32_e64 %25, %21, 0, implicit $exec %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 - %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec %32:sgpr_32 = S_MOV_B32 6144 %33:vgpr_32, %34:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 - %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec ... 
--- @@ -87,17 +87,17 @@ %26:vgpr_32, %27:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 - %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec %32:sgpr_32 = S_MOV_B32 6400 %33:vgpr_32, %34:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 - %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec %39:sgpr_32 = S_MOV_B32 11200 %40:vgpr_32, %41:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %39, 0, implicit $exec %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, 0, implicit $exec %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1 - %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, implicit $exec + %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, 0, implicit $exec ... 
--- @@ -140,17 +140,17 @@ %26:vgpr_32, %27:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, 0, implicit $exec %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 - %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec %32:sgpr_32 = S_MOV_B32 8192 %33:vgpr_32, %34:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %32, 0, implicit $exec %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, 0, implicit $exec %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 - %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, implicit $exec + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, 0, 0, implicit $exec %39:sgpr_32 = S_MOV_B32 10240 %40:vgpr_32, %41:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %39, 0, implicit $exec %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, 0, implicit $exec %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1 - %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, implicit $exec + %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, 0, 0, implicit $exec ... --- @@ -186,13 +186,13 @@ %26:vgpr_32, %27:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %25, 0, implicit $exec %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 4294967295, killed %27, 0, implicit $exec %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 - %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, implicit $exec + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, 0, 0, implicit $exec ... 
--- # GFX9-LABEL: name: diffoporder_add_store -# GFX9: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub0, 1000, 0, 0, 0 -# GFX9: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub1, 0, 0, 0, 0 +# GFX9: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub0, 1000, 0, 0, 0, 0 +# GFX9: GLOBAL_STORE_DWORD %{{[0-9]+}}, %0.sub1, 0, 0, 0, 0, 0 name: diffoporder_add_store body: | @@ -204,11 +204,11 @@ %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec %4:vgpr_32, dead %5:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1 - GLOBAL_STORE_DWORD %6, %0.sub0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %6, %0.sub0, 0, 0, 0, 0, 0, implicit $exec %8:sgpr_32 = S_MOV_B32 3000 %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 - GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %13, %0.sub1, 0, 0, 0, 0, 0, implicit $exec ... Index: llvm/test/CodeGen/AMDGPU/regbank-reassign.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/regbank-reassign.mir +++ llvm/test/CodeGen/AMDGPU/regbank-reassign.mir @@ -29,7 +29,7 @@ bb.0: %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF - GLOBAL_STORE_DWORD %1, %0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %1, %0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -45,7 +45,7 @@ bb.0: %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF - GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -241,15 +241,15 @@ %14 = IMPLICIT_DEF %15 = IMPLICIT_DEF %2 = V_AND_B32_e32 %1, %0, implicit $exec - GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -287,18 +287,18 @@ %11 = IMPLICIT_DEF %12 = IMPLICIT_DEF %2 = V_AND_B32_e32 %1, %0, implicit $exec - GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %3, %12, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %3, %12, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir +++ llvm/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir @@ -185,7 +185,7 @@ bb.28: %9 = S_FF1_I32_B32 undef %10 %13 = V_MAD_U32_U24_e64 killed %9, 48, 32, 0, implicit $exec - %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) %46 = V_AND_B32_e32 1, killed %45, implicit $exec %21 = S_BUFFER_LOAD_DWORD_SGPR undef %22, undef %23, 0, 0 :: (dereferenceable invariant load 4) %25 = nofpexcept V_CMP_GE_F32_e64 0, 0, 0, killed %21, 0, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir +++ llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir @@ -70,7 +70,7 @@ %13.sub2_sub3 = COPY killed %12 %20 = V_LSHL_B64_e64 killed %19, 2, implicit $exec %16 = COPY killed %5 - BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir +++ llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir @@ -62,12 +62,12 @@ bb.3: %1 = COPY killed %17 - FLAT_STORE_DWORD undef %10, %1.sub2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD undef %10, %1.sub2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %14 = COPY %1.sub1 %16 = COPY killed %1.sub0 undef %15.sub0 = COPY killed %16 %15.sub1 = COPY killed %14 - FLAT_STORE_DWORDX2 undef %11, killed %15, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORDX2 undef %11, killed %15, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... @@ -132,10 +132,10 @@ %6.sub2 = COPY %6.sub0 bb.2: - BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 S_SETPC_B64_return $sgpr30_sgpr31 Index: llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/reserved-reg-in-clause.mir @@ 
-0,0 +1,28 @@ +# RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass si-form-memory-clauses %s -o - | FileCheck -check-prefix=GCN %s + +# Make sure we do not produce an early-clobber list with odd subregs. +# Odd vector subregs are reserved on gfx90a and the verifier complains after RA. + +# GCN-LABEL: name: long_reg_clause +# GCN: dead early-clobber %2.sub0_sub1_sub2_sub3:areg_512, undef early-clobber %4.sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15:areg_512, dead early-clobber %3:areg_512 = BUNDLE %0, implicit $exec { +--- +name: long_reg_clause +body: | + bb.0.entry: + %0:vreg_64 = IMPLICIT_DEF + undef %1.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -208, 0, 0, 0, 0, implicit $exec + %1.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -224, 0, 0, 0, 0, implicit $exec + %1.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -240, 0, 0, 0, 0, implicit $exec + dead %1.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -256, 0, 0, 0, 0, implicit $exec + undef %2.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -80, 0, 0, 0, 0, implicit $exec + %2.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -96, 0, 0, 0, 0, implicit $exec + %2.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -112, 0, 0, 0, 0, implicit $exec + dead %2.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, -128, 0, 0, 0, 0, implicit $exec + undef %3.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, 0, implicit $exec + %3.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, 0, implicit $exec + %3.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, 0, implicit $exec + dead %3.sub0_sub1_sub2_sub3:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec + undef %4.sub12_sub13_sub14_sub15:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 176, 0, 0, 0, 0, implicit $exec + %4.sub8_sub9_sub10_sub11:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 160, 0, 0, 0, 0, implicit $exec + 
%4.sub4_sub5_sub6_sub7:areg_512 = GLOBAL_LOAD_DWORDX4 %0, 144, 0, 0, 0, 0, implicit $exec +... Index: llvm/test/CodeGen/AMDGPU/reserved-vgpr-tuples.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/reserved-vgpr-tuples.mir @@ -0,0 +1,248 @@ +# RUN: llc -march=amdgcn -mcpu=gfx908 -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX908 %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -start-before=greedy -stop-after=virtregrewriter -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,GFX90A %s + +--- +# GCN-LABEL: name: alloc_vgpr_64 +# GFX908: $vgpr3_vgpr4 = GLOBAL_LOAD +# GFX90A: $vgpr4_vgpr5 = GLOBAL_LOAD +name: alloc_vgpr_64 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX2 %0, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_vgpr_96 +# GFX908: $vgpr3_vgpr4_vgpr5 = GLOBAL_LOAD +# GFX90A: $vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD +name: alloc_vgpr_96 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_96 = GLOBAL_LOAD_DWORDX3 %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX3 %0, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +# GCN-LABEL: name: alloc_vgpr_128 +# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6 = GLOBAL_LOAD +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7 = GLOBAL_LOAD +name: alloc_vgpr_128 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_vgpr_160 +# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_LOAD +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_LOAD +name: alloc_vgpr_160 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_160 = IMAGE_LOAD_V5_V1 %1, undef %3:sgpr_256, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_vgpr_256 +# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10 = COPY +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = COPY +name: alloc_vgpr_256 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %3:sgpr_256 = IMPLICIT_DEF + %2:vreg_256 = COPY %3:sgpr_256 + %4:vreg_128 = IMAGE_SAMPLE_C_CL_O_V4_V8 %2, %3:sgpr_256, undef %5:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, addrspace 4) + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +# GCN-LABEL: name: alloc_vgpr_512 +# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18 = IMPLICIT_DEF +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF +name: alloc_vgpr_512 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_512 = IMPLICIT_DEF + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_vgpr_1024 +# GFX908: $vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34 = IMPLICIT_DEF +# GFX90A: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF +name: alloc_vgpr_1024 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$vgpr2' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr2 + %2:vreg_1024 = IMPLICIT_DEF + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 
%0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub16_sub17_sub18_sub19, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub20_sub21_sub22_sub23, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub24_sub25_sub26_sub27, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub28_sub29_sub30_sub31, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_agpr_64 +# GFX908: $agpr1_agpr2 = IMPLICIT_DEF +# GFX90A: $agpr2_agpr3 = IMPLICIT_DEF +name: alloc_agpr_64 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$agpr0' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $agpr0 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %3:areg_64 = IMPLICIT_DEF + %2:vreg_64 = COPY %3:areg_64 + GLOBAL_STORE_DWORDX2 %0, %2, 0, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = COPY $agpr0 + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_agpr_128 +# GFX908: $agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF +# GFX90A: $agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF +name: alloc_agpr_128 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$agpr0' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $agpr0 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %3:areg_128 = IMPLICIT_DEF + %2:vreg_128 = COPY %3:areg_128 + GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = COPY $agpr0 + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +# GCN-LABEL: name: alloc_agpr_512 +# GFX908: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = IMPLICIT_DEF +# GFX90A: $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17 = IMPLICIT_DEF +name: alloc_agpr_512 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$agpr0' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $agpr0 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %3:areg_512 = IMPLICIT_DEF + %2:vreg_512 = COPY %3:areg_512 + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = COPY $agpr0 + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... + +--- +# GCN-LABEL: name: alloc_agpr_1024 +# GFX908: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32 = IMPLICIT_DEF +# GFX90A: $agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31_agpr32_agpr33 = IMPLICIT_DEF +name: alloc_agpr_1024 +tracksRegLiveness: true +liveins: + - { reg: '$vgpr0_vgpr1' } + - { reg: '$agpr0' } +body: | + bb.0: + liveins: $vgpr0_vgpr1, $agpr0 + + %0:vreg_64 = COPY $vgpr0_vgpr1 + %3:areg_1024 = IMPLICIT_DEF + %2:vreg_1024 = COPY %3:areg_1024 + GLOBAL_STORE_DWORDX4 %0, %2.sub0_sub1_sub2_sub3, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub4_sub5_sub6_sub7, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub8_sub9_sub10_sub11, 0, 0, 0, 0, 0, 
implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub12_sub13_sub14_sub15, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub16_sub17_sub18_sub19, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub20_sub21_sub22_sub23, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub24_sub25_sub26_sub27, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %0, %2.sub28_sub29_sub30_sub31, 0, 0, 0, 0, 0, implicit $exec + %1:vgpr_32 = COPY $agpr0 + GLOBAL_STORE_DWORD %0, %1, 0, 0, 0, 0, 0, implicit $exec +... Index: llvm/test/CodeGen/AMDGPU/s_code_end.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/s_code_end.ll +++ llvm/test/CodeGen/AMDGPU/s_code_end.ll @@ -4,6 +4,9 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX90AEND-ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx90a --disassemble - | FileCheck -check-prefixes=GCN,GCN-OBJ,GFX90AEND-OBJ %s + ; GCN: a_kernel1{{>?}}: ; GCN: s_endpgm ; GCN-ASM: [[END_LABEL1:\.Lfunc_end.*]]: @@ -11,7 +14,7 @@ ; GCN-OBJ-NEXT: s_nop 0 -define amdgpu_kernel void @a_kernel1() { +define amdgpu_kernel void @a_kernel1() #0 { ret void } @@ -22,7 +25,7 @@ ; GCN-OBJ: {{^$}} -define amdgpu_kernel void @a_kernel2() { +define amdgpu_kernel void @a_kernel2() #0 { ret void } @@ -35,15 +38,22 @@ ; GCN-ASM-NEXT: [[END_LABEL3:\.Lfunc_end.*]]: ; GCN-ASM-NEXT: .size a_function, [[END_LABEL3]]-a_function ; GFX10END-ASM: .p2alignl 6, 3214868480 +; GFX90AEND-ASM: .p2alignl 6, 3212836864 ; GFX10END-ASM-NEXT: .fill 48, 4, 3214868480 +; 
GFX90AEND-ASM-NEXT: .fill 256, 4, 3212836864 ; GFX10NOEND-NOT: .fill ; GFX10NOEND-OBJ-NOT: s_code_end ; GFX10END-OBJ-NEXT: s_code_end +; GFX90AEND-OBJ-NEXT: s_nop 0 ; GFX10END-OBJ: s_code_end // 000000000140: ; GFX10END-OBJ-COUNT-47: s_code_end +; GFX90AEND-OBJ: s_nop 0 // 000000000140: +; GFX90AEND-OBJ-COUNT-255: s_nop 0 -define void @a_function() { +define void @a_function() #0 { ret void } + +attributes #0 = { "amdgpu-flat-work-group-size"="1,512" } Index: llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -27,7 +27,7 @@ ; CHECK: [[COPY:%[0-9]+]]:vreg_512 = COPY %0 ; CHECK: bb.1: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) ; CHECK: dead %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec ; CHECK: dead %8:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec ; CHECK: dead %9:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec @@ -51,7 +51,7 @@ %4:vreg_512 = COPY %0 bb.1: - BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) + BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec %8:vreg_64 = DS_READ_B64_gfx9 %1, 0, 0, implicit $exec %9:vreg_128 = 
DS_READ_B128_gfx9 %2, 0, 0, implicit $exec Index: llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -23,12 +23,12 @@ ; CHECK: liveins: $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec + ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, 0, implicit $exec ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec - ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, 0, implicit $exec ; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec ; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF @@ -42,17 +42,17 @@ ; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec - ; CHECK: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec + ; CHECK: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, 0, implicit $exec ; CHECK: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec - ; CHECK: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec - ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, implicit $exec + ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK: [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, 0, implicit $exec ; CHECK: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF1]], implicit $exec - ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, implicit $exec + ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, 0, implicit $exec ; CHECK: S_NOP 0, implicit [[DEF5]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF4]], implicit [[V_MOV_B32_e32_]] - ; CHECK: GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec + ; CHECK: GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, 0, implicit $exec ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode @@ -69,14 +69,14 @@ %0:vgpr_32 = COPY $vgpr0 %1:vreg_64 = IMPLICIT_DEF - %2:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, 0, 0, implicit $exec - %3:vgpr_32 = GLOBAL_LOAD_DWORD %1, 8, 0, 0, 0, implicit $exec + %2:vreg_64 = GLOBAL_LOAD_DWORDX2 %1, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = GLOBAL_LOAD_DWORD %1, 8, 0, 0, 0, 0, implicit $exec undef %4.sub1:vreg_64 = V_ADD_U32_e32 %0, %0, implicit $exec %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec %5:vreg_64 = 
COPY %2 undef %6.sub0:vreg_64 = V_ADD_F32_e32 %1.sub0, %5.sub0, implicit $mode, implicit $exec %6.sub1:vreg_64 = V_ADD_F32_e32 %1.sub1, %5.sub0, implicit $mode, implicit $exec - %7:vgpr_32 = GLOBAL_LOAD_DWORD %5, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = GLOBAL_LOAD_DWORD %5, 0, 0, 0, 0, 0, implicit $exec %8:vreg_64 = IMPLICIT_DEF %9:vreg_64 = IMPLICIT_DEF %10:vreg_64 = IMPLICIT_DEF @@ -90,15 +90,15 @@ %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec undef %19.sub0:vreg_64 = V_ADD_F32_e32 %7, %2.sub0, implicit $mode, implicit $exec %19.sub1:vreg_64 = V_ADD_F32_e32 %3, %3, implicit $mode, implicit $exec - GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec - %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD %9, 0, 0, 0, 0, implicit $exec - %8.sub0:vreg_64 = GLOBAL_LOAD_DWORD %10, 0, 0, 0, 0, implicit $exec - %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec - %21:vgpr_32 = GLOBAL_LOAD_DWORD %14, 0, 0, 0, 0, implicit $exec - %22:vgpr_32 = GLOBAL_LOAD_DWORD %15, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, 0, implicit $exec + %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD %9, 0, 0, 0, 0, 0, implicit $exec + %8.sub0:vreg_64 = GLOBAL_LOAD_DWORD %10, 0, 0, 0, 0, 0, implicit $exec + %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, 0, implicit $exec + %21:vgpr_32 = GLOBAL_LOAD_DWORD %14, 0, 0, 0, 0, 0, implicit $exec + %22:vgpr_32 = GLOBAL_LOAD_DWORD %15, 0, 0, 0, 0, 0, implicit $exec %23:vreg_64 = V_LSHLREV_B64_e64 2, %8, implicit $exec S_NOP 0, implicit %13, implicit %23.sub0, implicit %12, implicit %17 - GLOBAL_STORE_DWORD %15, %18, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %15, %18, 0, 0, 0, 0, 0, implicit $exec bb.1: S_SETREG_IMM32_B32 0, 1, implicit-def $mode, implicit $mode Index: llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -217,7 +217,7 @@ 
%19:sreg_32_xm0_xexec = IMPLICIT_DEF %20:vgpr_32 = V_ADD_CO_U32_e32 %19, %0, implicit-def dead $vcc, implicit $exec %21:vreg_64, dead %22:sreg_64 = V_MAD_I64_I32_e64 %20, 12, %7, 0, implicit $exec - %23:vgpr_32 = GLOBAL_LOAD_DWORD %21, 4, 0, 0, 0, implicit $exec + %23:vgpr_32 = GLOBAL_LOAD_DWORD %21, 4, 0, 0, 0, 0, implicit $exec %24:vreg_64, dead %25:sreg_64 = V_MAD_I64_I32_e64 %20, 48, %8, 0, implicit $exec %26:vreg_128 = IMPLICIT_DEF undef %27.sub0:sreg_64_xexec = S_LOAD_DWORD_IMM %6, 0, 0, 0 @@ -232,14 +232,14 @@ %33:sgpr_32 = S_ADDC_U32 %5.sub1, %31.sub1, implicit-def dead $scc, implicit killed $scc %34:vgpr_32 = IMPLICIT_DEF %35:vreg_64, dead %36:sreg_64 = V_MAD_I64_I32_e64 %23, %34, 0, 0, implicit $exec - %37:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 32, 0, 0, 0, implicit $exec + %37:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 32, 0, 0, 0, 0, implicit $exec undef %38.sub1:vreg_64 = V_ASHRREV_I32_e32 31, %37.sub0, implicit $exec %38.sub0:vreg_64 = COPY %37.sub0 %39:vreg_64 = V_LSHLREV_B64_e64 3, %38, implicit $exec undef %40.sub0:vreg_64, %41:sreg_64_xexec = V_ADD_CO_U32_e64 0, %39.sub0, 0, implicit $exec %42:vgpr_32 = COPY %33 %40.sub1:vreg_64, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %42, %39.sub1, %41, 0, implicit $exec - %44:vreg_64 = GLOBAL_LOAD_DWORDX2 %40, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.tmp34) + %44:vreg_64 = GLOBAL_LOAD_DWORDX2 %40, 0, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.tmp34) undef %45.sub1:vreg_64 = IMPLICIT_DEF %45.sub0:vreg_64 = COPY %37.sub1 %46:vreg_64 = V_LSHLREV_B64_e64 3, %45, implicit $exec @@ -247,7 +247,7 @@ %49:vgpr_32 = COPY %33 %47.sub1:vreg_64, dead %50:sreg_64_xexec = V_ADDC_U32_e64 %49, %46.sub1, %48, 0, implicit $exec %51:vreg_64 = IMPLICIT_DEF - undef %52.sub0:vreg_64 = GLOBAL_LOAD_DWORD %35, 40, 0, 0, 0, implicit $exec :: (load 4 from %ir.18 + 8) + undef %52.sub0:vreg_64 = GLOBAL_LOAD_DWORD %35, 40, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.18 + 8) %52.sub1:vreg_64 = IMPLICIT_DEF %53:vreg_64 = V_LSHLREV_B64_e64 3, 
%52, implicit $exec undef %54.sub0:vreg_64, %55:sreg_64_xexec = V_ADD_CO_U32_e64 0, %53.sub0, 0, implicit $exec @@ -258,31 +258,31 @@ %59:sreg_64 = IMPLICIT_DEF %60:sreg_32_xm0 = S_ADD_U32 %5.sub0, %59.sub0, implicit-def $scc %61:sgpr_32 = S_ADDC_U32 %5.sub1, %59.sub1, implicit-def dead $scc, implicit killed $scc - %62:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.20, align 4) + %62:vreg_64 = GLOBAL_LOAD_DWORDX2 %35, 0, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.20, align 4) undef %63.sub1:vreg_64 = V_ASHRREV_I32_e32 31, %62.sub0, implicit $exec %63.sub0:vreg_64 = COPY %62.sub0 %64:vreg_64 = IMPLICIT_DEF undef %65.sub0:vreg_64, %66:sreg_64_xexec = V_ADD_CO_U32_e64 %60, %64.sub0, 0, implicit $exec %67:vgpr_32 = COPY %61 %65.sub1:vreg_64, dead %68:sreg_64_xexec = V_ADDC_U32_e64 %67, %64.sub1, %66, 0, implicit $exec - %69:vreg_128 = GLOBAL_LOAD_DWORDX4 %65, 0, 0, 0, 0, implicit $exec :: (load 16 from %ir.tmp58) + %69:vreg_128 = GLOBAL_LOAD_DWORDX4 %65, 0, 0, 0, 0, 0, implicit $exec :: (load 16 from %ir.tmp58) undef %70.sub1:vreg_64 = IMPLICIT_DEF %70.sub0:vreg_64 = IMPLICIT_DEF %71:vreg_64 = IMPLICIT_DEF undef %72.sub0:vreg_64, %73:sreg_64_xexec = V_ADD_CO_U32_e64 %60, %71.sub0, 0, implicit $exec %74:vgpr_32 = COPY %61 %72.sub1:vreg_64, dead %75:sreg_64_xexec = V_ADDC_U32_e64 0, %71.sub1, %73, 0, implicit $exec - %76:vreg_128 = GLOBAL_LOAD_DWORDX4 %72, 0, 0, 0, 0, implicit $exec + %76:vreg_128 = GLOBAL_LOAD_DWORDX4 %72, 0, 0, 0, 0, 0, implicit $exec %77:vgpr_32 = IMPLICIT_DEF %78:vgpr_32 = IMPLICIT_DEF %79:vgpr_32 = nofpexcept V_MUL_F32_e32 0, %77, implicit $mode, implicit $exec %80:vgpr_32 = IMPLICIT_DEF %81:vgpr_32 = IMPLICIT_DEF %84:vgpr_32 = IMPLICIT_DEF - BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %80, 
%stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, 0, 0, implicit $exec %85:vgpr_32 = IMPLICIT_DEF %86:vgpr_32 = IMPLICIT_DEF %87:vgpr_32 = IMPLICIT_DEF Index: llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -25,7 +25,7 @@ ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $sgpr4_sgpr5 ; CHECK: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 - ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; CHECK: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0, 0 :: (dereferenceable invariant load 8, align 16, addrspace 4) ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 5329 ; CHECK: undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec @@ -37,7 +37,7 @@ ; CHECK: bb.1: ; CHECK: successors: %bb.1(0x80000000) ; CHECK: INLINEASM &"", 1 
/* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead %11 - ; CHECK: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; CHECK: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; CHECK: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load 8, addrspace 3) @@ -52,7 +52,7 @@ ; CHECK: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store 4, addrspace 3) ; CHECK: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store 4, addrspace 3) ; CHECK: DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3) - ; CHECK: undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; CHECK: undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; CHECK: [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] @@ -71,7 +71,7 @@ ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %42, 0, 0, 0 :: (load 4, addrspace 1) ; CHECK: INLINEASM &"", 1 /* sideeffect attdialect */ ; CHECK: [[DS_READ_B32_gfx9_4:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %45:vgpr_32, 0, 0, implicit $exec :: (load 4, addrspace 3) - ; CHECK: GLOBAL_STORE_DWORD undef %46:vreg_64, [[DS_READ_B32_gfx9_4]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; CHECK: GLOBAL_STORE_DWORD undef %46:vreg_64, [[DS_READ_B32_gfx9_4]], 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; CHECK: 
%31.sub0:vreg_64 = COPY [[S_LOAD_DWORD_IMM]], implicit $exec ; CHECK: DS_WRITE_B64_gfx9 undef %47:vgpr_32, %31, 0, 0, implicit $exec :: (store 8, addrspace 3) ; CHECK: S_BRANCH %bb.1 @@ -79,7 +79,7 @@ liveins: $sgpr4_sgpr5 %0:sgpr_64(p4) = COPY $sgpr4_sgpr5 - %1:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + %1:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) %3:sgpr_64 = S_LOAD_DWORDX2_IMM %0(p4), 0, 0, 0 :: (dereferenceable invariant load 8, align 16, addrspace 4) %4:sreg_32_xm0 = S_MOV_B32 5329 undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec @@ -91,7 +91,7 @@ bb.1: INLINEASM &"", 1, 851978, def %11:vgpr_32 - GLOBAL_STORE_DWORD undef %12:vreg_64, %1, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + GLOBAL_STORE_DWORD undef %12:vreg_64, %1, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %13:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load 8, addrspace 3) INLINEASM &"def $0 $1", 1, 851978, def %15:vgpr_32, 851978, def %16:vgpr_32 %17:vgpr_32 = DS_READ_B32_gfx9 %6, 0, 0, implicit $exec @@ -108,7 +108,7 @@ DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store 4, addrspace 3) DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store 4, addrspace 3) DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3) - undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %33:vgpr_32 = V_MUL_LO_U32_e64 %25, %4, implicit $exec %10:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %25, %26, implicit $exec %34:vgpr_32 = V_SUB_U32_e32 %33, %9, implicit $exec @@ -125,7 +125,7 @@ 
%43:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %42, 0, 0, 0 :: (load 4, addrspace 1) INLINEASM &"", 1 %44:vgpr_32 = DS_READ_B32_gfx9 undef %45:vgpr_32, 0, 0, implicit $exec :: (load 4, addrspace 3) - GLOBAL_STORE_DWORD undef %46:vreg_64, %44, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + GLOBAL_STORE_DWORD undef %46:vreg_64, %44, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %31.sub0:vreg_64 = COPY %43, implicit $exec DS_WRITE_B64_gfx9 undef %47:vgpr_32, %31, 0, 0, implicit $exec :: (store 8, addrspace 3) S_BRANCH %bb.1 Index: llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir +++ llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir @@ -12,15 +12,15 @@ ; CHECK-LABEL: name: denorm_mode_not_barrier ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4) - ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4) + ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4) + ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, 0, implicit $exec :: (load 4) ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD1]], implicit $exec ; CHECK: S_DENORM_MODE 0, implicit-def $mode, implicit $mode ; CHECK: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]] %0:vreg_64 = COPY $vgpr0_vgpr1 - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) S_DENORM_MODE 0, implicit-def $mode, implicit $mode - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 
0, 0, 0, implicit $exec :: (load 4) %3:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec S_ENDPGM 0, implicit %3 ... @@ -35,15 +35,15 @@ ; CHECK-LABEL: name: round_mode_not_barrier ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4) - ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4) + ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4) + ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, 0, implicit $exec :: (load 4) ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD1]], implicit $exec ; CHECK: S_ROUND_MODE 0, implicit-def $mode, implicit $mode ; CHECK: S_ENDPGM 0, implicit [[V_ADD_U32_e32_]] %0:vreg_64 = COPY $vgpr0_vgpr1 - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) S_ROUND_MODE 0, implicit-def $mode, implicit $mode - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4) + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec :: (load 4) %3:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec S_ENDPGM 0, implicit %3 ... 
@@ -58,17 +58,17 @@ ; CHECK-LABEL: name: denorm_mode_mode_def_use ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4) - ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4) + ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4) + ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, 0, implicit $exec :: (load 4) ; CHECK: S_DENORM_MODE 0, implicit-def $mode, implicit $mode ; CHECK: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec ; CHECK: S_ENDPGM 0, implicit [[V_ADD_F32_e32_]], implicit [[V_ADD_U32_e32_]] %0:vreg_64 = COPY $vgpr0_vgpr1 - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) S_DENORM_MODE 0, implicit-def $mode, implicit $mode %2:vgpr_32 = V_ADD_F32_e32 0, %1, implicit $mode, implicit $exec - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec :: (load 4) %4:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec S_ENDPGM 0, implicit %2, implicit %4 ... 
@@ -83,17 +83,17 @@ ; CHECK-LABEL: name: round_mode_mode_def_use ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, implicit $exec :: (load 4) - ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, implicit $exec :: (load 4) + ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, 0, 0, 0, implicit $exec :: (load 4) + ; CHECK: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, 0, 0, 0, implicit $exec :: (load 4) ; CHECK: S_ROUND_MODE 0, implicit-def $mode, implicit $mode ; CHECK: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec ; CHECK: S_ENDPGM 0, implicit [[V_ADD_F32_e32_]], implicit [[V_ADD_U32_e32_]] %0:vreg_64 = COPY $vgpr0_vgpr1 - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec :: (load 4) + %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec :: (load 4) S_ROUND_MODE 0, implicit-def $mode, implicit $mode %2:vgpr_32 = V_ADD_F32_e32 0, %1, implicit $mode, implicit $exec - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec :: (load 4) + %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, 0, implicit $exec :: (load 4) %4:vgpr_32 = V_ADD_U32_e32 %1, %2, implicit $exec S_ENDPGM 0, implicit %2, implicit %4 ... 
Index: llvm/test/CodeGen/AMDGPU/schedule-barrier.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -1,18 +1,43 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=machine-scheduler -o - %s | FileCheck %s --- # Check that the high latency loads are both scheduled first, before the # multiplies, despite the presence of a barrier in the function. -# CHECK: BUFFER_LOAD_DWORD_OFFSET -# CHECK: BUFFER_LOAD_DWORD_OFFSET -# CHECK: V_MUL_LO_U32_e64 -# CHECK: V_MUL_LO_U32_e64 name: test tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; CHECK-LABEL: name: test + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 + ; CHECK: undef %0.sub3:vreg_128 = COPY $vgpr9 + ; CHECK: undef %1.sub2:vreg_128 = COPY $vgpr8 + ; CHECK: undef %2.sub1:vreg_128 = COPY $vgpr7 + ; CHECK: undef %8.sub1:vreg_64 = COPY $vgpr1 + ; CHECK: %8.sub0:vreg_64 = COPY $vgpr0 + ; CHECK: undef %3.sub0:vreg_128 = COPY $vgpr6 + ; CHECK: undef %4.sub3:vreg_128 = COPY $vgpr5 + ; CHECK: undef %5.sub2:vreg_128 = COPY $vgpr4 + ; CHECK: undef %6.sub1:vreg_128 = COPY $vgpr3 + ; CHECK: undef %7.sub0:vreg_128 = COPY $vgpr2 + ; CHECK: undef %9.sub0:sgpr_128 = V_READFIRSTLANE_B32 %7.sub0, implicit $exec + ; CHECK: %9.sub1:sgpr_128 = V_READFIRSTLANE_B32 %6.sub1, implicit $exec + ; CHECK: %9.sub2:sgpr_128 = V_READFIRSTLANE_B32 %5.sub2, implicit $exec + ; CHECK: %9.sub3:sgpr_128 = V_READFIRSTLANE_B32 %4.sub3, implicit $exec + ; CHECK: S_BARRIER + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %9, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: undef %12.sub0:sgpr_128 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec + ; CHECK: %12.sub1:sgpr_128 = V_READFIRSTLANE_B32 %2.sub1, 
implicit $exec + ; CHECK: %12.sub2:sgpr_128 = V_READFIRSTLANE_B32 %1.sub2, implicit $exec + ; CHECK: %12.sub3:sgpr_128 = V_READFIRSTLANE_B32 %0.sub3, implicit $exec + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %12, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET]], [[BUFFER_LOAD_DWORD_OFFSET]], implicit $exec + ; CHECK: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET1]], [[BUFFER_LOAD_DWORD_OFFSET1]], implicit $exec + ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MUL_LO_U32_e64_]], [[V_MUL_LO_U32_e64_1]], implicit $exec + ; CHECK: GLOBAL_STORE_DWORD %8, [[V_ADD_U32_e32_]], 0, 0, 0, 0, 0, implicit $exec + ; CHECK: S_ENDPGM 0 undef %43.sub3:vreg_128 = COPY $vgpr9 undef %42.sub2:vreg_128 = COPY $vgpr8 undef %41.sub1:vreg_128 = COPY $vgpr7 @@ -30,17 +55,17 @@ %33.sub1:sgpr_128 = V_READFIRSTLANE_B32 %44.sub1, implicit $exec %33.sub2:sgpr_128 = V_READFIRSTLANE_B32 %45.sub2, implicit $exec %33.sub3:sgpr_128 = V_READFIRSTLANE_B32 %46.sub3, implicit $exec - %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %39:vgpr_32 = V_MUL_LO_U32_e64 %15, %15, implicit $exec undef %27.sub0:sgpr_128 = V_READFIRSTLANE_B32 %26.sub0, implicit $exec %27.sub1:sgpr_128 = V_READFIRSTLANE_B32 %41.sub1, implicit $exec %27.sub2:sgpr_128 = V_READFIRSTLANE_B32 %42.sub2, implicit $exec %27.sub3:sgpr_128 = V_READFIRSTLANE_B32 %43.sub3, implicit $exec - %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %40:vgpr_32 = V_MUL_LO_U32_e64 %19, %19, implicit $exec %23:vgpr_32 = V_ADD_U32_e32 %39, %40, implicit $exec - GLOBAL_STORE_DWORD %38, %23, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %38, %23, 0, 
0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir +++ llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir @@ -37,11 +37,11 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %12 = S_MOV_B32 123 %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec %11 = V_ADD_CO_U32_e32 %12, killed %10, implicit-def $vcc, implicit $exec - FLAT_STORE_DWORD %0, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) $sgpr30_sgpr31 = COPY %2 S_SETPC_B64_return $sgpr30_sgpr31 @@ -80,9 +80,9 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec %11 = V_TRUNC_F32_e64 0, killed %10, 1, 2, implicit $mode, implicit $exec, implicit-def $vcc - FLAT_STORE_DWORD %0, %11, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %11, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) $sgpr30_sgpr31 = COPY %2 S_SETPC_B64_return $sgpr30_sgpr31 Index: llvm/test/CodeGen/AMDGPU/sdwa-ops.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-ops.mir +++ llvm/test/CodeGen/AMDGPU/sdwa-ops.mir @@ -29,19 +29,19 @@ %63:vgpr_32, %65:sreg_64_xexec = nsw V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec %64:vgpr_32, dead %66:sreg_64_xexec = nuw V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 - 
GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec %163:vgpr_32, %165:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %161, 0, implicit $exec %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, 0, implicit $exec %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %162, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %162, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) %171:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec %173:vgpr_32, %175:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %171, 0, implicit $exec %174:vgpr_32, dead %176:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %175, 0, implicit $exec %172:vreg_64 = REG_SEQUENCE %173, %subreg.sub0, %174, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %172, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %172, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... 
@@ -77,13 +77,13 @@ %64:vgpr_32, dead %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) %161:vgpr_32 = V_AND_B32_e32 %22, %0, implicit $exec %163:vgpr_32, %165:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %161, 0, implicit $exec %164:vgpr_32, dead %166:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %165, 0, implicit $exec %162:vreg_64 = REG_SEQUENCE %163, %subreg.sub0, %164, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %162, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %162, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... @@ -113,7 +113,7 @@ %63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, killed %65, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %66, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... @@ -143,7 +143,7 @@ %63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %65, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... 
@@ -172,7 +172,7 @@ %63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... @@ -201,7 +201,7 @@ %30:vreg_64 = COPY $sgpr0_sgpr1 %63:vgpr_32, %65:sreg_64_xexec = V_ADD_CO_U32_e64 %30.sub0, %23, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %23, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... @@ -232,7 +232,7 @@ %30:vreg_64 = COPY $sgpr0_sgpr1 %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %24, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %23, %subreg.sub0, %23, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %30.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... @@ -263,7 +263,7 @@ %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec %31:vreg_64 = COPY $vcc %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... 
@@ -294,7 +294,8 @@ %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec %31:vreg_64 = COPY $vcc %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + ... @@ -325,7 +326,7 @@ %32:vreg_64 = REG_SEQUENCE %31, %subreg.sub0, %23, %subreg.sub1 %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %32.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %32.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... @@ -356,7 +357,7 @@ %32:vreg_64 = REG_SEQUENCE %31, %subreg.sub0, %23, %subreg.sub1 %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %32.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %32.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) ... 
@@ -386,5 +387,5 @@ %31:vreg_64 = COPY killed $vcc %64:vgpr_32, %66:sreg_64_xexec = V_ADDC_U32_e64 %30.sub1, %0, %65, 0, implicit $exec %62:vreg_64 = REG_SEQUENCE %63, %subreg.sub0, %64, %subreg.sub1 - GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) + GLOBAL_STORE_DWORDX2_SADDR %31.sub0, %62, %1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8) Index: llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir +++ llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir @@ -80,7 +80,7 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %5 = S_MOV_B32 65535 %6 = S_MOV_B32 65535 @@ -130,7 +130,7 @@ %100 = V_MOV_B32_e32 %48, implicit $exec - FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) $sgpr30_sgpr31 = COPY %2 S_SETPC_B64_return $sgpr30_sgpr31 @@ -227,7 +227,7 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %5 = S_MOV_B32 65535 %6 = S_MOV_B32 65535 @@ -286,7 +286,7 @@ %100 = V_MOV_B32_e32 %60, implicit $exec - FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) $sgpr30_sgpr31 = COPY %2 S_SETPC_B64_return $sgpr30_sgpr31 Index: llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir =================================================================== 
--- llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +++ llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -89,7 +89,7 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %5 = S_MOV_B32 65535 %6 = S_MOV_B32 65535 @@ -139,7 +139,7 @@ %100 = V_MOV_B32_e32 %48, implicit $exec - FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) $sgpr30_sgpr31 = COPY %2 S_SETPC_B64_return $sgpr30_sgpr31 @@ -256,7 +256,7 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %5 = S_MOV_B32 65535 %6 = S_MOV_B32 65535 @@ -315,7 +315,7 @@ %100 = V_MOV_B32_e32 %60, implicit $exec - FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) $sgpr30_sgpr31 = COPY %2 S_SETPC_B64_return $sgpr30_sgpr31 @@ -400,7 +400,7 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %5 = S_MOV_B32 65535 %6 = S_MOV_B32 65535 @@ -441,7 +441,7 @@ %100 = V_MOV_B32_e32 $vcc_lo, implicit $exec - FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) $sgpr30_sgpr31 = COPY %2 S_SETPC_B64_return $sgpr30_sgpr31 ... 
Index: llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir +++ llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir @@ -36,8 +36,8 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) - %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %5 = V_AND_B32_e32 65535, %3, implicit $exec %6 = V_LSHRREV_B32_e64 16, %4, implicit $exec @@ -51,7 +51,7 @@ %13 = V_OR_B32_e64 %10, %12, implicit $exec - FLAT_STORE_DWORD %0, %13, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %13, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) $sgpr30_sgpr31 = COPY %2 S_SETPC_B64_return $sgpr30_sgpr31 @@ -88,14 +88,14 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) - %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %9:vgpr_32 = V_LSHRREV_B16_e64 8, %3, implicit $exec %10:sreg_32_xm0 = S_MOV_B32 255 %11:vgpr_32 = V_AND_B32_e64 %3, killed %10, implicit $exec %17:vgpr_32 = V_MOV_B32_sdwa 0, %4, 0, 5, 2, 4, implicit $exec, implicit %11(tied-def 0) - FLAT_STORE_DWORD %0, %17, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %17, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) S_ENDPGM 0 ... 
@@ -131,14 +131,14 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) - %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %4 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %9:vgpr_32 = V_LSHRREV_B16_e64 8, %3, implicit $exec %10:sreg_32_xm0 = S_MOV_B32 65535 %11:vgpr_32 = V_AND_B32_e64 %3, killed %10, implicit $exec %17:vgpr_32 = V_MOV_B32_sdwa 0, %4, 0, 5, 2, 4, implicit $exec, implicit %11(tied-def 0) - FLAT_STORE_DWORD %0, %17, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %17, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir +++ llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir @@ -221,26 +221,26 @@ %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead $scc, implicit $scc %16 = REG_SEQUENCE %14, %subreg.sub0, %15, %subreg.sub1 %18 = COPY %16 - %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45) + %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45) %60 = V_BFE_U32_e64 %17, 8, 8, implicit $exec %61 = V_LSHLREV_B32_e32 2, killed %60, implicit $exec %70 = V_ADD_CO_U32_e32 %7.sub0, %61, implicit-def $vcc, implicit $exec %66 = COPY %13 %65 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec %67 = REG_SEQUENCE %70, %subreg.sub0, killed %65, %subreg.sub1 - FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9) + FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into 
%ir.tmp9) %37 = S_ADD_U32 %14, 4, implicit-def $scc %38 = S_ADDC_U32 %15, 0, implicit-def dead $scc, implicit $scc %71 = COPY killed %37 %72 = COPY killed %38 %41 = REG_SEQUENCE killed %71, %subreg.sub0, killed %72, %subreg.sub1 - %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep) + %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep) %73 = V_BFE_U32_e64 %40, 8, 8, implicit $exec %74 = V_LSHLREV_B32_e32 2, killed %73, implicit $exec %83 = V_ADD_CO_U32_e32 %7.sub0, %74, implicit-def $vcc, implicit $exec %78 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec %80 = REG_SEQUENCE %83, %subreg.sub0, killed %78, %subreg.sub1 - FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17) + FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17) %55 = S_ADD_U32 %0.sub0, 8, implicit-def $scc %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead $scc, implicit $scc %57 = REG_SEQUENCE %55, %subreg.sub0, killed %56, %subreg.sub1 @@ -384,26 +384,26 @@ %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead $scc, implicit $scc %16 = REG_SEQUENCE %14, %subreg.sub0, %15, %subreg.sub1 %18 = COPY %16 - %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45) + %17 = FLAT_LOAD_DWORD %18, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.uglygep45) %60 = V_BFE_U32_e64 %17, 8, 8, implicit $exec %61 = V_LSHLREV_B32_e32 %84, killed %60, implicit $exec %70 = V_ADD_CO_U32_e32 %7.sub0, %61, implicit-def $vcc, implicit $exec %66 = COPY %13 %65 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec %67 = REG_SEQUENCE %70, %subreg.sub0, killed %65, %subreg.sub1 - FLAT_STORE_DWORD %67, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9) + FLAT_STORE_DWORD %67, %30, 
0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp9) %37 = S_ADD_U32 %14, 4, implicit-def $scc %38 = S_ADDC_U32 %15, 0, implicit-def dead $scc, implicit $scc %71 = COPY killed %37 %72 = COPY killed %38 %41 = REG_SEQUENCE killed %71, %subreg.sub0, killed %72, %subreg.sub1 - %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep) + %40 = FLAT_LOAD_DWORD killed %41, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.scevgep) %73 = V_BFE_U32_e64 %40, 8, 8, implicit $exec %74 = V_LSHLREV_B32_e32 %84, killed %73, implicit $exec %83 = V_ADD_CO_U32_e32 %7.sub0, %74, implicit-def $vcc, implicit $exec %78 = V_ADDC_U32_e32 0, %66, implicit-def $vcc, implicit $vcc, implicit $exec %80 = REG_SEQUENCE %83, %subreg.sub0, killed %78, %subreg.sub1 - FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17) + FLAT_STORE_DWORD %80, %30, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.tmp17) %55 = S_ADD_U32 %0.sub0, 8, implicit-def $scc %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead $scc, implicit $scc %57 = REG_SEQUENCE %55, %subreg.sub0, killed %56, %subreg.sub1 Index: llvm/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir +++ llvm/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir @@ -41,7 +41,7 @@ %2 = COPY $sgpr30_sgpr31 %1 = COPY $vgpr2_vgpr3 %0 = COPY $vgpr0_vgpr1 - %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) %12 = V_LSHRREV_B32_e64 16, %3, implicit $exec %13 = V_BCNT_U32_B32_e64 %3, killed %12, implicit-def $vcc, implicit $exec @@ -56,6 +56,6 @@ %19 = V_READLANE_B32 killed %18, 0, implicit-def $vcc, implicit $exec %20 = V_MOV_B32_e64 %19, implicit $exec - FLAT_STORE_DWORD %0, %20, 0, 0, 0, 0, 
implicit $exec, implicit $flat_scr :: (store 4) + FLAT_STORE_DWORD %0, %20, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4) $sgpr30_sgpr31 = COPY %2 S_SETPC_B64_return $sgpr30_sgpr31 Index: llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -84,7 +84,7 @@ bb.0: %0:sreg_32_xm0 = COPY $sgpr32 %1:vreg_64 = IMPLICIT_DEF - %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit undef $vgpr0 Index: llvm/test/CodeGen/AMDGPU/shrink-carry.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/shrink-carry.mir +++ llvm/test/CodeGen/AMDGPU/shrink-carry.mir @@ -21,7 +21,7 @@ %2 = IMPLICIT_DEF %3 = V_CMP_GT_U32_e64 %0, %1, implicit $exec %4, %5 = V_SUBBREV_U32_e64 0, %0, %3, 0, implicit $exec - GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, 0, implicit $exec ... @@ -46,7 +46,7 @@ %2 = IMPLICIT_DEF %3 = V_CMP_GT_U32_e64 %0, %1, implicit $exec %4, %5 = V_SUBB_U32_e64 %0, 0, %3, 0, implicit $exec - GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, 0, implicit $exec ... 
@@ -71,7 +71,7 @@ %2 = IMPLICIT_DEF %3 = V_CMP_GT_U32_e64 %0, %1, implicit $exec %4, %5 = V_ADDC_U32_e64 0, %0, %3, 0, implicit $exec - GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, 0, implicit $exec ... @@ -96,6 +96,6 @@ %2 = IMPLICIT_DEF %3 = V_CMP_GT_U32_e64 %0, %1, implicit $exec %4, %5 = V_ADDC_U32_e64 %0, 0, %3, 0, implicit $exec - GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, %4, 0, 0, 0, 0, 0, implicit $exec ... Index: llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir +++ llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -81,11 +81,11 @@ %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec %29, %9 = V_ADD_CO_U32_e64 %19, %17, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -165,11 +165,11 @@ %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec %29, %9 = V_SUB_CO_U32_e64 %19, %17, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -249,11 +249,11 @@ %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec %29, %9 = V_SUBREV_CO_U32_e64 %19, %17, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -332,12 +332,12 @@ %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec %9 = S_MOV_B64 0 %29, $vcc = V_ADDC_U32_e64 %19, %17, %9, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -417,12 +417,12 @@ %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec $vcc = S_MOV_B64 0 %29, $vcc = V_ADDC_U32_e64 %19, %17, $vcc, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -502,11 +502,11 @@ %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64_e64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec %29, $vcc = V_ADDC_U32_e64 %19, %17, undef $vcc, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir +++ llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir @@ -15,7 +15,7 @@ ; CHECK: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec ; CHECK: renamable $vgpr0 = V_ADD_CO_U32_e32 $sgpr0, killed $vgpr0, implicit-def $vcc, implicit $exec ; CHECK: renamable $vgpr1 = V_ADDC_U32_e32 0, killed $vgpr1, implicit-def $vcc, implicit killed $vcc, implicit $exec - ; CHECK: renamable $vgpr0 = FLAT_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + ; CHECK: renamable $vgpr0 = FLAT_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) ; CHECK: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0, 0 :: (dereferenceable invariant load 8, align 16, addrspace 4) ; CHECK: S_WAITCNT 112 ; CHECK: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec @@ -61,7 +61,7 @@ $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec renamable $vgpr0 = 
V_ADD_CO_U32_e32 $sgpr0, killed $vgpr0, implicit-def $vcc, implicit $exec renamable $vgpr1 = V_ADDC_U32_e32 0, killed $vgpr1, implicit-def $vcc, implicit killed $vcc, implicit $exec - renamable $vgpr0 = FLAT_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) + renamable $vgpr0 = FLAT_LOAD_DWORD renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4, addrspace 1) renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0, 0 :: (dereferenceable invariant load 8, align 16, addrspace 4) S_WAITCNT 112 V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec Index: llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir +++ llvm/test/CodeGen/AMDGPU/spill-agpr-partially-undef.mir @@ -17,9 +17,9 @@ ; CHECK-LABEL: name: spill_a64_kill ; CHECK: liveins: $agpr0_agpr1 ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, 
$sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... @@ -41,9 +41,9 @@ ; CHECK-LABEL: name: spill_a64_undef_sub1_killed ; CHECK: liveins: $agpr0 ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... 
@@ -63,8 +63,8 @@ ; CHECK-LABEL: name: spill_a64_undef_sub0_killed ; CHECK: liveins: $agpr1 ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $agpr0_agpr1 :: (store 4 into %stack.0, addrspace 5) ; CHECK: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $agpr0_agpr1 :: (store 4 into %stack.0 + 4, addrspace 5) SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... 
Index: llvm/test/CodeGen/AMDGPU/spill-agpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -1,15 +1,23 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s -; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2M %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908-A2M,A2M %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX90A-A2M,A2M %s ; GCN-LABEL: {{^}}max_24regs_32a_used: -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; A2V-NOT: SCRATCH_RSRC -; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse -; A2V: ScratchSize: 0 +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GCN-DAG: v_mfma_f32_16x16x1f32 +; GCN-DAG: v_mfma_f32_16x16x1f32 +; A2V-NOT: SCRATCH_RSRC +; GFX908-A2M-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse +; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse +; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload +; GFX90A-NOT: 
v_accvgpr_read_b32 +; GFX90A-A2M: buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; GFX90A-A2M: buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload +; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse +; GFX90A-NOT: v_accvgpr_write_b32 +; A2V: ScratchSize: 0 define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 { bb: %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg @@ -35,8 +43,10 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload +; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload +; GFX90A-A2M: buffer_store_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; GFX90A-A2M: buffer_load_dword a{{[0-9]+}}, off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 { @@ -60,17 +70,17 @@ } ; GCN-LABEL: {{^}}max_10_vgprs_used_9a: -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 +; GFX908-A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 +; GFX908-A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; A2V: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} ; Reload Reuse ; A2V: 
v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse ; A2V: ScratchSize: 0 -; A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload -; A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse +; GFX908-A2M: buffer_store_dword v[[VSPILLSTORE:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; GFX908-A2M: buffer_load_dword v[[VSPILL_RELOAD:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload +; GFX908-A2M: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL_RELOAD]] ; Reload Reuse define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { %a1 = call <4 x i32> asm sideeffect "", "=a"() %a2 = call <4 x i32> asm sideeffect "", "=a"() @@ -86,9 +96,12 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 ; Reload Reuse -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload +; GFX908-A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; GFX90A-NOT: v_accvgpr_read_b32 +; GFX90A: v_mfma_f32_32x32x1f32 +; GFX908-A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; Reload Reuse +; GFX90A-NOT: v_accvgpr_write_b32 ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 { bb: Index: llvm/test/CodeGen/AMDGPU/spill-agpr.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-agpr.mir +++ llvm/test/CodeGen/AMDGPU/spill-agpr.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been 
autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=SPILLED %s -# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regallocfast,si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=GFX908-SPILLED %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regallocfast,si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=GFX908-EXPANDED %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=regallocfast -o - %s | FileCheck -check-prefix=GFX90A-SPILLED %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=regallocfast,si-lower-sgpr-spills,prologepilog -o - %s | FileCheck -check-prefix=GFX90A-EXPANDED %s --- name: spill_restore_agpr32 @@ -9,6 +11,72 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; GFX908-SPILLED-LABEL: name: spill_restore_agpr32 + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: successors: %bb.1(0x80000000) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX908-SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX908-SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: S_NOP 1 + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5) + ; GFX908-SPILLED: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 + ; 
GFX908-EXPANDED-LABEL: name: spill_restore_agpr32 + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX908-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX908-EXPANDED: S_NOP 1 + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr32 + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX90A-SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX90A-SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: S_NOP 1 + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5) + ; GFX90A-SPILLED: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit killed 
renamable $agpr0, implicit killed renamable $agpr1 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32 + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX90A-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX90A-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX90A-EXPANDED: S_NOP 1 + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX90A-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 ; SPILLED-LABEL: name: spill_restore_agpr32 ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) @@ -61,6 +129,64 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; GFX908-SPILLED-LABEL: name: spill_restore_agpr64 + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: successors: %bb.1(0x80000000) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1 + ; GFX908-SPILLED: SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: S_NOP 1 + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1 + ; 
GFX908-EXPANDED-LABEL: name: spill_restore_agpr64 + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1 + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 + ; GFX908-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1 + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX908-EXPANDED: S_NOP 1 + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 + ; GFX908-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr64 + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1 + ; GFX90A-SPILLED: SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: S_NOP 1 + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 8 from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr64 + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX90A-EXPANDED: S_NOP 0, 
implicit-def renamable $agpr0_agpr1 + ; GFX90A-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 + ; GFX90A-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr0_agpr1 + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX90A-EXPANDED: S_NOP 1 + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1 + ; GFX90A-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 + ; GFX90A-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 + ; GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1 ; SPILLED-LABEL: name: spill_restore_agpr64 ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) @@ -108,6 +234,122 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; GFX908-SPILLED-LABEL: name: spill_restore_agpr32_used_all_vgprs + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: successors: %bb.1(0x80000000) + ; GFX908-SPILLED: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX908-SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit undef 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191 + ; GFX908-SPILLED: S_NOP 0, implicit undef 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247 + ; GFX908-SPILLED: S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0 + ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr32_used_all_vgprs + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX908-EXPANDED: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; GFX908-EXPANDED: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX908-EXPANDED: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5) + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; 
GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191 + ; GFX908-EXPANDED: S_NOP 0, implicit undef 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247 + ; GFX908-EXPANDED: S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable $agpr0 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr32_used_all_vgprs + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX90A-SPILLED: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GFX90A-SPILLED: S_NOP 0, 
implicit undef $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX90A-SPILLED: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247 + ; GFX90A-SPILLED: S_NOP 0, implicit undef 
$vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 + ; GFX90A-SPILLED: S_NOP 0, implicit killed renamable $agpr0 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32_used_all_vgprs + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0 + ; GFX90A-EXPANDED: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef 
$vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247 + ; GFX90A-EXPANDED: S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 + ; GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0 ; SPILLED-LABEL: name: spill_restore_agpr32_used_all_vgprs ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) @@ -203,6 +445,68 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: 
$sgpr32 body: | + ; GFX908-SPILLED-LABEL: name: spill_restore_agpr96 + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: successors: %bb.1(0x80000000) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2 + ; GFX908-SPILLED: SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: S_NOP 1 + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 12 from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 + ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr96 + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2 + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 + ; GFX908-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 + ; GFX908-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2 + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX908-EXPANDED: S_NOP 1 + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX908-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX908-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, 
implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr96 + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2 + ; GFX90A-SPILLED: SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store 12 into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: S_NOP 1 + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 12 from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr96 + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2 + ; GFX90A-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 + ; GFX90A-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 + ; GFX90A-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2 + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A-EXPANDED: S_NOP 1 + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX90A-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX90A-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, 
implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX90A-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2 + ; GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 ; SPILLED-LABEL: name: spill_restore_agpr96 ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) @@ -252,6 +556,72 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; GFX908-SPILLED-LABEL: name: spill_restore_agpr128 + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: successors: %bb.1(0x80000000) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3 + ; GFX908-SPILLED: SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: S_NOP 1 + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr128 + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit killed 
$agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX908-EXPANDED: S_NOP 1 + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr128 + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-SPILLED: SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: S_NOP 1 + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr128 + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED: $vgpr0 = 
V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-EXPANDED: S_NOP 1 + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 + ; GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 ; SPILLED-LABEL: name: spill_restore_agpr128 ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) @@ -303,6 +673,76 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; GFX908-SPILLED-LABEL: name: spill_restore_agpr160 + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: successors: %bb.1(0x80000000) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-SPILLED: SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; 
GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: S_NOP 1 + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr160 + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX908-EXPANDED: S_NOP 1 + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, 
implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr160 + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-SPILLED: SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store 20 into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: S_NOP 1 + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 20 from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr160 + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX90A-EXPANDED: S_NOP 1 + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX90A-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 + ; GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 ; SPILLED-LABEL: name: spill_restore_agpr160 ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) @@ -356,6 +796,80 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; GFX908-SPILLED-LABEL: name: spill_restore_agpr192 + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: successors: %bb.1(0x80000000) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-SPILLED: SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: S_NOP 1 + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr192 + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX908-EXPANDED: S_NOP 1 + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; 
GFX908-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr192 + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-SPILLED: SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store 24 into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: S_NOP 1 + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 24 from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr192 + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit 
$exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX90A-EXPANDED: S_NOP 1 + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX90A-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; SPILLED-LABEL: name: spill_restore_agpr192 ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) @@ -411,6 +925,88 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; 
GFX908-SPILLED-LABEL: name: spill_restore_agpr256 + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: successors: %bb.1(0x80000000) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-SPILLED: SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: S_NOP 1 + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr256 + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; 
GFX908-EXPANDED: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX908-EXPANDED: S_NOP 1 + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr256 + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-SPILLED: SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store 32 into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: S_NOP 1 + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 32 from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr256 + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX90A-EXPANDED: S_NOP 1 + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 + ; GFX90A-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A-EXPANDED: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; 
GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; SPILLED-LABEL: name: spill_restore_agpr256 ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) @@ -470,6 +1066,120 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; GFX908-SPILLED-LABEL: name: spill_restore_agpr512 + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: successors: %bb.1(0x80000000) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-SPILLED: SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: S_NOP 1 + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr512 + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; 
GFX908-EXPANDED: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; GFX908-EXPANDED: S_NOP 1 + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr512 + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-SPILLED: SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store 64 into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: S_NOP 1 + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 64 from %stack.0, align 4, addrspace 5) + ; 
GFX90A-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr512 + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: 
successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; GFX90A-EXPANDED: S_NOP 1 + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 + ; GFX90A-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr8 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; SPILLED-LABEL: name: spill_restore_agpr512 ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) @@ -545,6 +1255,184 @@ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 stackPtrOffsetReg: $sgpr32 body: | + ; GFX908-SPILLED-LABEL: name: spill_restore_agpr1024 + ; GFX908-SPILLED: bb.0: + ; GFX908-SPILLED: 
successors: %bb.1(0x80000000) + ; GFX908-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-SPILLED: SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-SPILLED: bb.1: + ; GFX908-SPILLED: successors: %bb.2(0x80000000) + ; GFX908-SPILLED: S_NOP 1 + ; GFX908-SPILLED: bb.2: + ; GFX908-SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr1024 + ; GFX908-EXPANDED: bb.0: + ; GFX908-EXPANDED: successors: %bb.1(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + ; GFX908-EXPANDED: S_NOP 0, implicit-def renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; 
GFX908-EXPANDED: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX908-EXPANDED: bb.1: + ; GFX908-EXPANDED: successors: %bb.2(0x80000000) + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + ; GFX908-EXPANDED: S_NOP 1 + ; GFX908-EXPANDED: bb.2: + ; GFX908-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + ; GFX908-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr1 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $vgpr16, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $vgpr17, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $vgpr18, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $vgpr19, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $vgpr20, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $vgpr21, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $vgpr22, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $vgpr23, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $vgpr24, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $vgpr25, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $vgpr26, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $vgpr27, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $vgpr28, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $vgpr29, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX908-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr1024 + ; GFX90A-SPILLED: bb.0: + ; GFX90A-SPILLED: successors: %bb.1(0x80000000) + ; GFX90A-SPILLED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-SPILLED: SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store 128 into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-SPILLED: bb.1: + ; GFX90A-SPILLED: successors: %bb.2(0x80000000) + ; GFX90A-SPILLED: S_NOP 1 + ; GFX90A-SPILLED: bb.2: + ; GFX90A-SPILLED: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 128 from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED: S_NOP 0, implicit killed renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr1024 + ; GFX90A-EXPANDED: bb.0: + ; GFX90A-EXPANDED: successors: %bb.1(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + ; GFX90A-EXPANDED: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr6 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr8 = V_ACCVGPR_READ_B32_e64 killed $agpr8, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr9 = V_ACCVGPR_READ_B32_e64 killed $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr10 = V_ACCVGPR_READ_B32_e64 killed $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr11 = V_ACCVGPR_READ_B32_e64 killed $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr12 = V_ACCVGPR_READ_B32_e64 killed $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr13 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr14 = V_ACCVGPR_READ_B32_e64 killed $agpr14, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr16 = V_ACCVGPR_READ_B32_e64 killed $agpr16, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr17 = V_ACCVGPR_READ_B32_e64 killed $agpr17, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr18 = V_ACCVGPR_READ_B32_e64 killed $agpr18, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr19 = V_ACCVGPR_READ_B32_e64 killed $agpr19, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr20 = V_ACCVGPR_READ_B32_e64 killed $agpr20, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr21 = V_ACCVGPR_READ_B32_e64 killed $agpr21, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr22 = V_ACCVGPR_READ_B32_e64 killed $agpr22, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr23 = V_ACCVGPR_READ_B32_e64 killed $agpr23, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr24 = V_ACCVGPR_READ_B32_e64 killed $agpr24, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr25 = V_ACCVGPR_READ_B32_e64 killed $agpr25, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr26 = V_ACCVGPR_READ_B32_e64 killed $agpr26, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr27 = V_ACCVGPR_READ_B32_e64 killed $agpr27, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr28 = V_ACCVGPR_READ_B32_e64 killed $agpr28, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr29 = V_ACCVGPR_READ_B32_e64 killed $agpr29, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr30 = V_ACCVGPR_READ_B32_e64 killed $agpr30, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $vgpr31 = V_ACCVGPR_READ_B32_e64 killed $agpr31, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: S_CBRANCH_SCC1 %bb.1, implicit undef $scc + ; GFX90A-EXPANDED: bb.1: + ; GFX90A-EXPANDED: successors: %bb.2(0x80000000) + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, 
$vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + ; GFX90A-EXPANDED: S_NOP 1 + ; GFX90A-EXPANDED: bb.2: + ; GFX90A-EXPANDED: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + ; GFX90A-EXPANDED: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $vgpr16, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $vgpr17, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $vgpr18, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $vgpr19, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $vgpr20, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $vgpr21, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $vgpr22, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $vgpr23, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $vgpr24, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $vgpr25, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $vgpr26, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $vgpr27, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $vgpr28, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $vgpr29, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr30, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr31, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; GFX90A-EXPANDED: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; SPILLED-LABEL: name: spill_restore_agpr1024 ; SPILLED: bb.0: ; SPILLED: successors: %bb.1(0x80000000) Index: llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir +++ llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir @@ -21,12 +21,18 @@ ; GCN-LABEL: name: spill_sgpr128_use_subreg ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN: $sgpr8_sgpr9 = S_OR_SAVEEXEC_B64 -1, 
implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) + ; GCN: $exec = S_MOV_B64 killed $sgpr8_sgpr9 ; GCN: renamable $sgpr1 = COPY $sgpr2 ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: renamable $sgpr8 = COPY killed renamable $sgpr1 + ; GCN: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5) + ; GCN: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; GCN: S_ENDPGM 0, implicit $sgpr8 renamable $sgpr1 = COPY $sgpr2 SI_SPILL_S128_SAVE renamable $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.0, align 4, addrspace 5) @@ -51,11 +57,17 @@ ; GCN-LABEL: name: spill_sgpr128_use_kill ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN: $sgpr8_sgpr9 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.1, addrspace 5) + ; GCN: $exec = S_MOV_B64 killed $sgpr8_sgpr9 ; GCN: renamable $sgpr1 = COPY $sgpr2 ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 
; GCN: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.1, addrspace 5) + ; GCN: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; GCN: S_ENDPGM 0 renamable $sgpr1 = COPY $sgpr2 SI_SPILL_S128_SAVE renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.0, align 4, addrspace 5) @@ -79,10 +91,10 @@ ; GCN-LABEL: name: spill_vgpr128_use_subreg ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN: renamable $vgpr1 = COPY $vgpr2 - ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5) + ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr1, 
$sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5) + ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5) + ; GCN: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) ; GCN: renamable $vgpr8 = COPY killed renamable $vgpr1 ; GCN: S_ENDPGM 0, implicit $vgpr8 renamable $vgpr1 = COPY $vgpr2 @@ -108,10 +120,10 @@ ; GCN-LABEL: name: spill_vgpr128_use_kill ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN: renamable $vgpr1 = COPY $vgpr2 - ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5) + ; GCN: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5) + ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5) + ; GCN: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) ; GCN: S_ENDPGM 0 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5) Index: llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir +++ llvm/test/CodeGen/AMDGPU/spill-special-sgpr.mir @@ -50,7 +50,7 @@ ; GFX9: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit $vcc ; GFX9: $vcc = S_MOV_B64 $exec ; GFX9: $exec = S_MOV_B64 3 - ; GFX9: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ; GFX9: $exec = S_MOV_B64 $vcc ; GFX9: $vcc_hi = V_READLANE_B32 $vgpr0, 1 ; GFX9: $vcc_lo = V_READLANE_B32 killed $vgpr0, 0 @@ -59,11 +59,11 @@ ; GFX9: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit killed $vcc ; GFX9: $vcc = S_MOV_B64 $exec ; GFX9: $exec = S_MOV_B64 3 - ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, 
$sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ; GFX9: $exec = S_MOV_B64 killed $vcc ; GFX9: $vcc = S_MOV_B64 $exec ; GFX9: $exec = S_MOV_B64 3 - ; GFX9: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; GFX9: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; GFX9: $exec = S_MOV_B64 killed $vcc ; GFX9: $vcc_lo = V_READLANE_B32 $vgpr0, 0, implicit-def $vcc ; GFX9: $vcc_hi = V_READLANE_B32 killed $vgpr0, 1 @@ -81,7 +81,7 @@ ; GFX10: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit $vcc ; GFX10: $vcc = S_MOV_B64 $exec ; GFX10: $exec = S_MOV_B64 3 - ; GFX10: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX10: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ; GFX10: $exec = S_MOV_B64 $vcc ; GFX10: $vcc_hi = V_READLANE_B32 $vgpr0, 1 ; GFX10: $vcc_lo = V_READLANE_B32 killed $vgpr0, 0 @@ -90,11 +90,11 @@ ; GFX10: $vgpr0 = V_WRITELANE_B32 $vcc_hi, 1, $vgpr0, implicit killed $vcc ; GFX10: $vcc = S_MOV_B64 $exec ; GFX10: $exec = S_MOV_B64 3 - ; GFX10: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GFX10: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ; GFX10: $exec = S_MOV_B64 killed $vcc ; GFX10: $vcc = S_MOV_B64 $exec ; GFX10: $exec = S_MOV_B64 3 - ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, 
addrspace 5) + ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr33, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; GFX10: $exec = S_MOV_B64 killed $vcc ; GFX10: $vcc_lo = V_READLANE_B32 $vgpr0, 0, implicit-def $vcc ; GFX10: $vcc_hi = V_READLANE_B32 killed $vgpr0, 1 Index: llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir +++ llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir @@ -24,10 +24,10 @@ ; CHECK: %3.sub2:sgpr_128 = COPY %2.sub2 ; CHECK: %3.sub3:sgpr_128 = COPY %2.sub3 ; CHECK: early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE %3, implicit $exec { - ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1) - ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) - ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1) - ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1) + ; CHECK: 
[[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) ; CHECK: } ; CHECK: undef %47.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec ; CHECK: SI_SPILL_V128_SAVE %47, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, align 4, addrspace 5) @@ -56,26 +56,26 @@ ; CHECK: undef %113.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec ; CHECK: undef %117.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec ; CHECK: SI_SPILL_V128_SAVE %117, %stack.10, $sgpr32, 0, implicit $exec :: (store 16 into %stack.10, align 4, addrspace 5) - ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1) ; CHECK: undef %122.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec ; CHECK: undef %126.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec ; CHECK: undef %130.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec ; CHECK: SI_SPILL_V128_SAVE %130, %stack.11, $sgpr32, 0, implicit $exec :: (store 16 into %stack.11, align 4, addrspace 5) ; CHECK: undef %135.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec ; CHECK: SI_SPILL_V128_SAVE %135, %stack.12, $sgpr32, 0, implicit $exec :: (store 16 into %stack.12, align 4, addrspace 5) - ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = 
BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) ; CHECK: undef %140.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec ; CHECK: undef %144.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec ; CHECK: SI_SPILL_V128_SAVE %144, %stack.13, $sgpr32, 0, implicit $exec :: (store 16 into %stack.13, align 4, addrspace 5) ; CHECK: undef %149.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec ; CHECK: SI_SPILL_V128_SAVE %149, %stack.14, $sgpr32, 0, implicit $exec :: (store 16 into %stack.14, align 4, addrspace 5) ; CHECK: undef %154.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec - ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1) ; CHECK: undef %158.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec ; CHECK: undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec ; CHECK: undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec ; CHECK: undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec - ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + ; CHECK: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) ; CHECK: undef %40.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec ; CHECK: 
undef %41.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec ; CHECK: undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec @@ -155,190 +155,190 @@ ; CHECK: %43.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec ; CHECK: %43.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec ; CHECK: %43.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) ; CHECK: %42.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %42.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: %41.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %41.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) ; CHECK: %40.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %40.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: %38.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %38.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) 
; CHECK: %37.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %37.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: %36.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %36.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1) ; CHECK: undef %157.sub0:vreg_128 = COPY %159.sub0 { ; CHECK: internal %157.sub2:vreg_128 = COPY %159.sub2 ; CHECK: } ; CHECK: %157.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %157.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %157, %2, 0, 400, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %157, %2, 0, 400, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: undef %153.sub0:vreg_128 = COPY %155.sub0 { ; CHECK: internal %153.sub2:vreg_128 = COPY %155.sub2 ; CHECK: } ; CHECK: %153.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %153.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 352, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 352, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load 16 from %stack.14, align 4, addrspace 5) ; CHECK: undef %148.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 { ; CHECK: internal %148.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2 ; CHECK: } ; CHECK: %148.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %148.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: 
BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 368, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 368, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load 16 from %stack.13, align 4, addrspace 5) ; CHECK: undef %143.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 { ; CHECK: internal %143.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2 ; CHECK: } ; CHECK: %143.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %143.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %143, %2, 0, 320, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %143, %2, 0, 320, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) ; CHECK: undef %139.sub0:vreg_128 = COPY %141.sub0 { ; CHECK: internal %139.sub2:vreg_128 = COPY %141.sub2 ; CHECK: } ; CHECK: %139.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %139.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %139, %2, 0, 336, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %139, %2, 0, 336, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load 16 from %stack.12, align 4, addrspace 5) ; CHECK: undef %134.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 { ; CHECK: internal %134.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2 ; CHECK: } ; CHECK: %134.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %134.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 288, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) ; 
CHECK: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load 16 from %stack.11, align 4, addrspace 5) ; CHECK: undef %129.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 { ; CHECK: internal %129.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2 ; CHECK: } ; CHECK: %129.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %129.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %129, %2, 0, 304, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %129, %2, 0, 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: undef %125.sub0:vreg_128 = COPY %127.sub0 { ; CHECK: internal %125.sub2:vreg_128 = COPY %127.sub2 ; CHECK: } ; CHECK: %125.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %125.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %125, %2, 0, 256, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %125, %2, 0, 256, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1) ; CHECK: undef %121.sub0:vreg_128 = COPY %123.sub0 { ; CHECK: internal %121.sub2:vreg_128 = COPY %123.sub2 ; CHECK: } ; CHECK: %121.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %121.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %121, %2, 0, 272, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %121, %2, 0, 272, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load 16 from %stack.10, align 4, addrspace 5) ; CHECK: undef %116.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 { ; CHECK: internal %116.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2 ; CHECK: } ; CHECK: %116.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %116.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET 
%116, %2, 0, 224, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %116, %2, 0, 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) ; CHECK: undef %112.sub0:vreg_128 = COPY %114.sub0 { ; CHECK: internal %112.sub2:vreg_128 = COPY %114.sub2 ; CHECK: } ; CHECK: %112.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %112.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 240, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: undef %108.sub0:vreg_128 = COPY %110.sub0 { ; CHECK: internal %108.sub2:vreg_128 = COPY %110.sub2 ; CHECK: } ; CHECK: %108.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %108.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %108, %2, 0, 192, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %108, %2, 0, 192, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) ; CHECK: undef %104.sub0:vreg_128 = COPY %106.sub0 { ; CHECK: internal %104.sub2:vreg_128 = COPY %106.sub2 ; CHECK: } ; CHECK: %104.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %104.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %104, %2, 0, 208, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %104, %2, 0, 208, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load 16 from %stack.9, align 4, addrspace 5) ; CHECK: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 { ; CHECK: internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2 ; CHECK: } ; CHECK: %99.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %99.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, 
%2, 0, 160, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 160, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load 16 from %stack.8, align 4, addrspace 5) ; CHECK: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 { ; CHECK: internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2 ; CHECK: } ; CHECK: %94.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %94.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load 16 from %stack.7, align 4, addrspace 5) ; CHECK: undef %89.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 { ; CHECK: internal %89.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2 ; CHECK: } ; CHECK: %89.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %89.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %89, %2, 0, 128, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %89, %2, 0, 128, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1) ; CHECK: undef %85.sub0:vreg_128 = COPY %87.sub0 { ; CHECK: internal %85.sub2:vreg_128 = COPY %87.sub2 ; CHECK: } ; CHECK: %85.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %85.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %85, %2, 0, 144, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %85, %2, 0, 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE23:%[0-9]+]]:vreg_128 = 
SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load 16 from %stack.6, align 4, addrspace 5) ; CHECK: undef %80.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 { ; CHECK: internal %80.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2 ; CHECK: } ; CHECK: %80.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %80.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %80, %2, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %80, %2, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE24:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load 16 from %stack.5, align 4, addrspace 5) ; CHECK: undef %75.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 { ; CHECK: internal %75.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2 ; CHECK: } ; CHECK: %75.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %75.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %75, %2, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %75, %2, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE25:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load 16 from %stack.4, align 4, addrspace 5) ; CHECK: undef %70.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 { ; CHECK: internal %70.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2 ; CHECK: } ; CHECK: %70.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %70.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %70, %2, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %70, %2, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) ; CHECK: undef %66.sub0:vreg_128 = COPY %68.sub0 { ; CHECK: internal %66.sub2:vreg_128 = COPY 
%68.sub2 ; CHECK: } ; CHECK: %66.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %66.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %66, %2, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %66, %2, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE26:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load 16 from %stack.3, align 4, addrspace 5) ; CHECK: undef %61.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub0 { ; CHECK: internal %61.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub2 ; CHECK: } ; CHECK: %61.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %61.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %61, %2, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %61, %2, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE27:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load 16 from %stack.2, align 4, addrspace 5) ; CHECK: undef %56.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub0 { ; CHECK: internal %56.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub2 ; CHECK: } ; CHECK: %56.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %56.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %56, %2, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %56, %2, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE28:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load 16 from %stack.1, align 4, addrspace 5) ; CHECK: undef %51.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub0 { ; CHECK: internal %51.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub2 ; CHECK: } ; CHECK: %51.sub1:vreg_128 = COPY %43.sub1 
; CHECK: %51.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %51, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %51, %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1) ; CHECK: [[SI_SPILL_V128_RESTORE29:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 16 from %stack.0, align 4, addrspace 5) ; CHECK: undef %46.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub0 { ; CHECK: internal %46.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub2 ; CHECK: } ; CHECK: %46.sub1:vreg_128 = COPY %43.sub1 ; CHECK: %46.sub3:vreg_128 = COPY %43.sub1 - ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %46, %2, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + ; CHECK: BUFFER_STORE_DWORDX4_OFFSET %46, %2, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) ; CHECK: S_ENDPGM 0 %0:sgpr_64(p4) = COPY $sgpr0_sgpr1 %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0(p4), 9, 0, 0 :: (dereferenceable invariant load 16, align 4, addrspace 4) @@ -351,10 +351,10 @@ %3.sub2:sgpr_128 = COPY %2.sub2 %3.sub3:sgpr_128 = COPY %2.sub3 early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE %3, implicit $exec { - %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1) - %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) - %4:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1) - %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + %7:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 128, addrspace 1) + %5:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, 
addrspace 1) + %4:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1) + %6:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) } undef %8.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %7.sub1, implicit $exec undef %9.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %7.sub0, implicit $exec @@ -372,22 +372,22 @@ undef %21.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub0, implicit $exec undef %22.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub3, implicit $exec undef %23.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %6.sub2, implicit $exec - %24:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1) + %24:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 64, addrspace 1) undef %25.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub1, implicit $exec undef %26.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub0, implicit $exec undef %27.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub3, implicit $exec undef %28.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %24.sub2, implicit $exec - %29:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + %29:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) undef %30.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub1, implicit $exec undef %31.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub0, implicit $exec undef %32.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub3, implicit $exec undef %33.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %29.sub2, implicit $exec - %34:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1) + %34:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, align 32, addrspace 1) undef %35.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub1, implicit 
$exec undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub0, implicit $exec undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub3, implicit $exec undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %34.sub2, implicit $exec - %39:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) + %39:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16, addrspace 1) undef %40.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub1, implicit $exec undef %41.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub0, implicit $exec undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, %39.sub3, implicit $exec @@ -427,99 +427,99 @@ %43.sub0:vreg_128 = V_AND_B32_e32 %44, %39.sub2, implicit $exec %43.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec %43.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) %42.sub1:vreg_128 = COPY %43.sub1 %42.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %41.sub1:vreg_128 = COPY %43.sub1 %41.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) %40.sub1:vreg_128 = COPY %43.sub1 %40.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %38.sub1:vreg_128 = COPY %43.sub1 %38.sub3:vreg_128 = COPY %43.sub1 
- BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) %37.sub1:vreg_128 = COPY %43.sub1 %37.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %36.sub1:vreg_128 = COPY %43.sub1 %36.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1) %35.sub1:vreg_128 = COPY %43.sub1 %35.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %35, %2, 0, 400, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %35, %2, 0, 400, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %33.sub1:vreg_128 = COPY %43.sub1 %33.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %33, %2, 0, 352, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %33, %2, 0, 352, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) %32.sub1:vreg_128 = COPY %43.sub1 %32.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %32, %2, 0, 368, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %32, %2, 0, 368, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %31.sub1:vreg_128 = COPY %43.sub1 %31.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %31, %2, 0, 320, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %31, %2, 0, 320, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) %30.sub1:vreg_128 = COPY 
%43.sub1 %30.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %30, %2, 0, 336, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %30, %2, 0, 336, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %28.sub1:vreg_128 = COPY %43.sub1 %28.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %28, %2, 0, 288, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %28, %2, 0, 288, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) %27.sub1:vreg_128 = COPY %43.sub1 %27.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %27, %2, 0, 304, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %27, %2, 0, 304, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %26.sub1:vreg_128 = COPY %43.sub1 %26.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %26, %2, 0, 256, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %26, %2, 0, 256, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 256, addrspace 1) %25.sub1:vreg_128 = COPY %43.sub1 %25.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %25, %2, 0, 272, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %25, %2, 0, 272, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %23.sub1:vreg_128 = COPY %43.sub1 %23.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %23, %2, 0, 224, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %23, %2, 0, 224, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) %22.sub1:vreg_128 = COPY %43.sub1 %22.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %22, %2, 0, 240, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %22, %2, 0, 240, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) 
%21.sub1:vreg_128 = COPY %43.sub1 %21.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %21, %2, 0, 192, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %21, %2, 0, 192, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) %20.sub1:vreg_128 = COPY %43.sub1 %20.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %20, %2, 0, 208, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %20, %2, 0, 208, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %19.sub1:vreg_128 = COPY %43.sub1 %19.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %19, %2, 0, 160, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %19, %2, 0, 160, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) %18.sub1:vreg_128 = COPY %43.sub1 %18.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %18, %2, 0, 176, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %18, %2, 0, 176, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %17.sub1:vreg_128 = COPY %43.sub1 %17.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %17, %2, 0, 128, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %17, %2, 0, 128, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 128, addrspace 1) %16.sub1:vreg_128 = COPY %43.sub1 %16.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %16, %2, 0, 144, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %16, %2, 0, 144, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %15.sub1:vreg_128 = COPY %43.sub1 %15.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %15, %2, 0, 96, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %15, %2, 0, 96, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(store 16, align 32, addrspace 1) %14.sub1:vreg_128 = COPY %43.sub1 %14.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %14, %2, 0, 112, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %14, %2, 0, 112, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %13.sub1:vreg_128 = COPY %43.sub1 %13.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %13, %2, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %13, %2, 0, 64, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 64, addrspace 1) %12.sub1:vreg_128 = COPY %43.sub1 %12.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %12, %2, 0, 80, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %12, %2, 0, 80, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %11.sub1:vreg_128 = COPY %43.sub1 %11.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %11, %2, 0, 32, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %11, %2, 0, 32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 32, addrspace 1) %10.sub1:vreg_128 = COPY %43.sub1 %10.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %10, %2, 0, 48, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %10, %2, 0, 48, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) %9.sub1:vreg_128 = COPY %43.sub1 %9.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %9, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %9, %2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16, align 512, addrspace 1) %8.sub1:vreg_128 = COPY %43.sub1 %8.sub3:vreg_128 = COPY %43.sub1 - BUFFER_STORE_DWORDX4_OFFSET %8, %2, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (store 16, addrspace 1) + BUFFER_STORE_DWORDX4_OFFSET %8, %2, 0, 16, 0, 0, 0, 0, 0, 0, implicit $exec 
:: (store 16, addrspace 1) S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -46,9 +46,9 @@ ; CHECK: %71.sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK: %71.sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK: early-clobber %87:vgpr_32, early-clobber %117:vgpr_32, early-clobber %76:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], undef %118:sgpr_128, undef %89:sgpr_128, [[V_MOV_B32_e32_]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom 
"BufferResource", align 1, addrspace 4) ; CHECK: } ; CHECK: SI_SPILL_S128_SAVE %71, %stack.1, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5) ; CHECK: %71.sub1:sgpr_128 = S_MOV_B32 0 @@ -110,9 +110,9 @@ ; CHECK: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %190, 0, 0, 0 :: (load 16 from %ir.123, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %200, 0, 0, 0 :: (load 16 from %ir.131, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %210, 0, 0, 0 :: (load 16 from %ir.138, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 
0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %364:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %375:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR]], -98, implicit-def dead $scc @@ -129,17 +129,17 @@ ; CHECK: %351.sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY9]], 4, implicit-def dead $scc ; CHECK: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR undef %396:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %50, 224, 0, 0 :: (load 16 from %ir.155, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %217, 0, 0, 0 :: (load 16 from %ir.144, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %224, 0, 0, 0 :: (load 16 from %ir.150, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %241, 0, 0, 0 :: (load 16 from %ir.162, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %253, 0, 0, 0 :: (load 16 from %ir.170, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: 
[[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR4]], -217, implicit-def dead $scc ; CHECK: [[S_ADD_I32_8:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR3]], -233, implicit-def dead $scc ; CHECK: [[S_ADD_I32_9:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR5]], -249, implicit-def dead $scc @@ -153,48 +153,48 @@ ; CHECK: undef %411.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %411.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY10]], 4, implicit-def dead $scc - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc ; CHECK: undef %425.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_3]], [[S_LSHL_B32_4]], implicit-def $scc ; CHECK: %425.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_3]], [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ADD_U32_4:%[0-9]+]]:sreg_32 = S_ADD_U32 %56.sub0, 168, implicit-def $scc ; CHECK: [[S_ADDC_U32_4:%[0-9]+]]:sreg_32 = S_ADDC_U32 undef %57:sreg_32, 0, implicit-def dead $scc, implicit $scc ; CHECK: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 3, 
implicit-def dead $scc - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK: undef %441.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_5]], implicit-def $scc ; CHECK: %441.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %441, 0, 0, 0 :: (load 4 from %ir..i085.i, align 8, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %261, 0, 0, 0 :: (load 16 from %ir.176, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %273, 0, 0, 0 :: (load 16 from %ir.185, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %286, 0, 0, 0 :: (load 16 from %ir.194, addrspace 4) ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %293, 0, 0, 0 :: (load 16 from %ir.200, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 3, implicit-def dead $scc - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc ; CHECK: undef %453.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_6]], implicit-def $scc ; CHECK: %453.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %453, 0, 0, 0 :: (load 8 from %ir.304, addrspace 4) - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %327, 0, 0, 0 :: (load 16 from %ir.223, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %335, 0, 0, 0 :: (load 16 from %ir.230, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0, 0 :: (load 16 from %ir.236, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %351, 0, 0, 0 :: (load 16 from %ir.242, addrspace 4) ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN17:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM19]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN18:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM20]], 0, 0, 0, 0, 0, 0, 0, 0, implicit 
$exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN19:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM21]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 3, implicit-def dead $scc - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc ; CHECK: undef %468.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_7]], implicit-def $scc @@ -214,8 +214,8 @@ ; CHECK: %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %485, 0, 0, 0 :: (load 4 from %ir..i0100.i, align 8, addrspace 4) ; CHECK: early-clobber %413:vgpr_32, early-clobber %427:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM23]], [[S_LOAD_DWORDX4_IMM24]], [[V_MOV_B32_e32_]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN 
[[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: } ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) @@ -238,9 +238,9 @@ ; CHECK: %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %530, 0, 0, 0 :: (load 16 from %ir.359, addrspace 4) ; CHECK: early-clobber %516:vgpr_32, early-clobber %532:vgpr_32, early-clobber %524:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM26]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], [[S_LOAD_DWORDX4_IMM27]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN 
[[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) ; CHECK: } ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], implicit $exec Index: llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -4,16 +4,10 @@ # CHECK-LABEL: name: no_merge_sgpr_vgpr_spill_slot{{$}} # CHECK: stack: # CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, -# CHECK-NEXT: stack-id: default, - -# CHECK: - { id: 1, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, # CHECK-NEXT: stack-id: sgpr-spill, -# CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) - -# 
CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.1, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5) -# CHECK: $sgpr5 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5) +# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.0, implicit $exec, implicit $sgpr32 :: (store 4 into %stack.0, addrspace 5) +# CHECK: renamable $sgpr5 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 4 from %stack.0, addrspace 5) name: no_merge_sgpr_vgpr_spill_slot tracksRegLiveness: true @@ -23,8 +17,8 @@ stackPtrOffsetReg: $sgpr32 body: | bb.0: - %0:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, implicit $flat_scr, implicit $exec - %2:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, implicit $flat_scr, implicit $exec + %2:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $flat_scr, implicit $exec + %0:vgpr_32 = FLAT_LOAD_DWORD undef $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $flat_scr, implicit $exec S_NOP 0, implicit %0 %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0 %3:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef $sgpr0_sgpr1, 0, 0, 0 Index: llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir +++ llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir @@ -97,7 +97,7 @@ %11.sub5:sgpr_256 = COPY %11.sub0 %11.sub6:sgpr_256 = COPY %11.sub0 %11.sub7:sgpr_256 = COPY %11.sub0 - %12:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %9, %11, undef %13:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) + %12:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %9, %11, undef %13:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) %14:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec %15:vreg_128 = IMPLICIT_DEF 
S_CBRANCH_SCC1 %bb.8, implicit undef $scc @@ -268,7 +268,7 @@ %62:vgpr_32 = V_MOV_B32_e32 1033100696, implicit $exec %63:vgpr_32 = V_MUL_F32_e32 1060575065, %15.sub1, implicit $mode, implicit $exec %63:vgpr_32 = V_MAC_F32_e32 1046066128, %15.sub0, %63, implicit $mode, implicit $exec - %64:vgpr_32 = IMAGE_LOAD_V1_V2 %60, %61, 1, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) + %64:vgpr_32 = IMAGE_LOAD_V1_V2 %60, %61, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, addrspace 4) %64:vgpr_32 = V_MAC_F32_e32 target-flags(amdgpu-gotprel) 0, %51.sub0, %64, implicit $mode, implicit $exec %65:vgpr_32 = V_MUL_F32_e32 0, %64, implicit $mode, implicit $exec %66:vgpr_32 = V_MUL_F32_e32 0, %65, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/subvector-test.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/subvector-test.mir +++ llvm/test/CodeGen/AMDGPU/subvector-test.mir @@ -31,6 +31,6 @@ bb.2: - GLOBAL_STORE_DWORD %15, %16, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %15, %16, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/syncscopes.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/syncscopes.ll +++ llvm/test/CodeGen/AMDGPU/syncscopes.ll @@ -1,9 +1,9 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-after=si-insert-skips < %s | FileCheck --check-prefix=GCN %s ; GCN-LABEL: name: syncscopes -; GCN: FLAT_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("agent") seq_cst 4 into %ir.agent_out) -; GCN: FLAT_STORE_DWORD killed renamable $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) -; GCN: FLAT_STORE_DWORD killed renamable $vgpr7_vgpr8, killed renamable $vgpr6, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) +; GCN: FLAT_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("agent") seq_cst 4 into %ir.agent_out) +; GCN: FLAT_STORE_DWORD killed renamable $vgpr4_vgpr5, killed renamable $vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) +; GCN: FLAT_STORE_DWORD killed renamable $vgpr7_vgpr8, killed renamable $vgpr6, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) define void @syncscopes( i32 %agent, i32* %agent_out, Index: llvm/test/CodeGen/AMDGPU/tgsplit.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/tgsplit.ll @@ -0,0 +1,11 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,NOTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GCN,TGSPLIT %s + +; GCN-LABEL: 
.amdhsa_kernel test +; NOTGSPLIT: .amdhsa_tg_split 0 +; NOTGSPLIT: COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0 +; TGSPLIT: .amdhsa_tg_split 1 +; TGSPLIT: COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 1 +define amdgpu_kernel void @test() { + ret void +} Index: llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -25,7 +25,7 @@ ; GCN: bb.2.else: ; GCN: successors: ; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GCN: bb.3: entry: %cc = icmp sgt i32 %a, 0 @@ -61,7 +61,7 @@ ; GCN: bb.4.else: ; GCN: successors: ; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) ; GCN: bb.5: entry: %cc = icmp sgt i32 %a, 0 Index: llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/twoaddr-fma-f64.mir @@ -0,0 +1,186 @@ +# RUN: llc -march=amdgcn -mcpu=gfx90a %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: test_fmamk_reg_imm_f64 +# GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit 
$mode, implicit $exec +--- +name: test_fmamk_reg_imm_f64 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } + - { id: 3, class: vreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = COPY %0 + %2 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + %3 = V_FMAC_F64_e32 killed %0, %2, killed %1, implicit $mode, implicit $exec + +... + +# GCN-LABEL: name: test_fmamk_imm_reg_f64 +# GCN: V_FMA_F64_e64 0, %2, 0, killed %0.sub0_sub1, 0, killed %1, 0, 0, implicit $mode, implicit $exec +--- +name: test_fmamk_imm_reg_f64 +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } + - { id: 3, class: vreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = COPY %0.sub2_sub3 + %2 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + %3 = V_FMAC_F64_e32 %2, killed %0.sub0_sub1, killed %1, implicit $mode, implicit $exec + +... + +# GCN-LABEL: name: test_fmaak_f64 +# GCN: V_FMA_F64_e64 0, killed %0.sub0_sub1, 0, %0.sub2_sub3, 0, %1, 0, 0, implicit $mode, implicit $exec +--- +name: test_fmaak_f64 +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + %2 = V_FMAC_F64_e32 killed %0.sub0_sub1, %0.sub2_sub3, %1, implicit $mode, implicit $exec + +... + +# GCN-LABEL: name: test_fmaak_sgpr_src0_f64 +# GCN: V_FMA_F64_e64 0, killed %0, 0, %1, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec + +--- +name: test_fmaak_sgpr_src0_f64 +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } + - { id: 3, class: vreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + %3 = V_FMAC_F64_e32 killed %0, %1, %2, implicit $mode, implicit $exec + +... 
+ +# GCN-LABEL: name: test_fmaak_inlineimm_src0_f64 +# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec + +--- +name: test_fmaak_inlineimm_src0_f64 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } +body: | + bb.0: + + %0 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + %2 = V_FMAC_F64_e32 4611686018427387904, %0, %1, implicit $mode, implicit $exec + +... + +# GCN-LABEL: name: test_fmaak_otherimm_src0_f64 +# GCN: V_FMA_F64_e64 0, 4611686018427387904, 0, %0, 0, %1:vreg_64, 0, 0, implicit $mode, implicit $exec + +--- +name: test_fmaak_otherimm_src0_f64 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } +body: | + bb.0: + + %0 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + %2 = V_FMAC_F64_e32 4611686018427387904, %0, %1, implicit $mode, implicit $exec + +... + +# GCN-LABEL: name: test_fmaak_other_constantlike_src0_f64 +# GCN: V_FMAC_F64_e32 %stack.0, %0, %2, implicit $mode, implicit $exec +--- +name: test_fmaak_other_constantlike_src0_f64 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } +stack: + - { id: 0, name: "", type: default, offset: 0, size: 128, alignment: 8, + callee-saved-register: '', local-offset: 0, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } +body: | + bb.0: + + %0 = V_MOV_B64_PSEUDO 4607182418800017408, implicit $exec + %2 = V_FMAC_F64_e32 %stack.0, %0, %1, implicit $mode, implicit $exec + +... 
+ +# GCN-LABEL: name: test_fmamk_reg_unfoldable_literal_src0_f64 +# GCN: V_FMA_F64_e64 0, %2, 0, killed %0, 0, killed %1, 0, 0, implicit $mode, implicit $exec +--- +name: test_fmamk_reg_unfoldable_literal_src0_f64 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } + - { id: 3, class: vreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = COPY %0 + %2 = V_MOV_B64_PSEUDO 123456, implicit $exec + %3 = V_FMAC_F64_e32 %2, killed %0, killed %1, implicit $mode, implicit $exec + +... + +# GCN-LABEL: name: test_fmamk_reg_unfoldable_literal_src1_f64 +# GCN: V_FMA_F64_e64 0, killed %0, 0, %2, 0, killed %1, 0, 0, implicit $mode, implicit $exec +--- +name: test_fmamk_reg_unfoldable_literal_src1_f64 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } + - { id: 3, class: vreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = COPY %0 + %2 = V_MOV_B64_PSEUDO 123456, implicit $exec + %3 = V_FMAC_F64_e32 killed %0, %2, killed %1, implicit $mode, implicit $exec + +... + +# GCN-LABEL: name: test_fmaak_reg_unfoldable_literal_src2_f64 +# GCN: V_FMA_F64_e64 0, killed %0, 0, killed %1, 0, %2, 0, 0, implicit $mode, implicit $exec +--- +name: test_fmaak_reg_unfoldable_literal_src2_f64 +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: vreg_64 } + - { id: 3, class: vreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = COPY %0 + %2 = V_MOV_B64_PSEUDO 123456, implicit $exec + %3 = V_FMAC_F64_e32 killed %0, killed %1, %2, implicit $mode, implicit $exec + +... 
Index: llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir +++ llvm/test/CodeGen/AMDGPU/undefined-physreg-sgpr-spill.mir @@ -44,7 +44,7 @@ liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr13 $vgpr1_vgpr2 = COPY killed $sgpr4_sgpr5, implicit $exec - $vgpr1 = GLOBAL_LOAD_UBYTE killed $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec :: (non-temporal dereferenceable invariant load 1 from `i1 addrspace(4)* undef`) + $vgpr1 = GLOBAL_LOAD_UBYTE killed $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal dereferenceable invariant load 1 from `i1 addrspace(4)* undef`) $vcc = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 1, killed $vgpr1, implicit $exec $vgpr1 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed $sgpr0_sgpr1, implicit $exec @@ -109,7 +109,7 @@ liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr13 $vgpr1_vgpr2 = COPY killed $sgpr4_sgpr5, implicit $exec - $vgpr1 = GLOBAL_LOAD_UBYTE killed $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec :: (non-temporal dereferenceable invariant load 1 from `i1 addrspace(4)* undef`) + $vgpr1 = GLOBAL_LOAD_UBYTE killed $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal dereferenceable invariant load 1 from `i1 addrspace(4)* undef`) $vcc = V_CMP_NE_U32_e64 0, $vgpr0, implicit $exec $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 1, killed $vgpr1, implicit $exec $vgpr1 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed $sgpr0_sgpr1, implicit $exec Index: llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir @@ -0,0 +1,80 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX900 %s +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs %s -o - | 
FileCheck -check-prefixes=GCN,GFX90A %s + +# GCN-LABEL: name: v_mov_b64_from_vgpr +# GFX900: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec +name: v_mov_b64_from_vgpr +body: | + bb.0: + $vgpr0_vgpr1 = V_MOV_B64_PSEUDO $vgpr2_vgpr3, implicit $exec +... + +# GCN-LABEL: name: v_mov_b64_from_sgpr +# GFX900: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec +name: v_mov_b64_from_sgpr +body: | + bb.0: + $vgpr0_vgpr1 = V_MOV_B64_PSEUDO $sgpr2_sgpr3, implicit $exec +... + +# GCN-LABEL: name: v_mov_b64_from_sext_inline_imm +# GFX900: $vgpr0 = V_MOV_B32_e32 4294967294, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr1 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX90A: $vgpr0 = V_MOV_B32_e32 4294967294, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX90A: $vgpr1 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1 +name: v_mov_b64_from_sext_inline_imm +body: | + bb.0: + $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 18446744073709551614, implicit $exec +... + +# GCN-LABEL: name: v_mov_b64_from_lit +# GCN: $vgpr0 = V_MOV_B32_e32 1430494974, implicit $exec, implicit-def $vgpr0_vgpr1 +# GCN: $vgpr1 = V_MOV_B32_e32 4294734465, implicit $exec, implicit-def $vgpr0_vgpr1 +name: v_mov_b64_from_lit +body: | + bb.0: + $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 18445744073609551614, implicit $exec +... 
+ +# GCN-LABEL: name: v_mov_b64_from_first_inline_imm +# GCN: $vgpr0 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1 +# GCN: $vgpr1 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0_vgpr1 +name: v_mov_b64_from_first_inline_imm +body: | + bb.0: + $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 1152921504606846975, implicit $exec +... + +# GCN-LABEL: name: v_mov_b64_from_second_inline_imm +# GCN: $vgpr0 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0_vgpr1 +# GCN: $vgpr1 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1 +name: v_mov_b64_from_second_inline_imm +body: | + bb.0: + $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 18446744069683019775, implicit $exec +... + +# GCN-LABEL: name: v_mov_b64_from_same_sext_inline_imm +# GFX900: $vgpr0 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr1 = V_MOV_B32_e32 4294967295, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, -1, 8, -1, 0, 0, 0, 0, 0, implicit $exec +name: v_mov_b64_from_same_sext_inline_imm +body: | + bb.0: + $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 18446744073709551615, implicit $exec +... + +# GCN-LABEL: name: v_mov_b64_from_same_fp_inline_imm +# GFX900: $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 1065353216, 8, 1065353216, 0, 0, 0, 0, 0, implicit $exec +name: v_mov_b64_from_same_fp_inline_imm +body: | + bb.0: + $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec +... 
Index: llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -24,7 +24,7 @@ liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_BRANCH %bb.3 @@ -32,7 +32,7 @@ liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = V_MOV_B32_e32 1, implicit $exec bb.3: @@ -40,7 +40,7 @@ $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -65,7 +65,7 @@ liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_BRANCH %bb.3 @@ -73,7 +73,7 @@ liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = V_MOV_B32_e32 1, implicit $exec bb.3: @@ -81,7 +81,7 @@ $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -110,9 +110,9 @@ # instructions to fix vccz. 
# CHECK-LABEL: name: reload_vcc_from_mem -# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec +# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec # CHECK: $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc -# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec +# CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec # CHECK: $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc # SI: $vcc = S_MOV_B64 $vcc # GFX9: $vcc = S_MOV_B64 $vcc @@ -121,9 +121,9 @@ name: reload_vcc_from_mem body: | bb.0: - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec $vcc_lo = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec $vcc_hi = V_READFIRSTLANE_B32 killed $vgpr0, implicit $exec, implicit-def $vcc S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc bb.1: Index: llvm/test/CodeGen/AMDGPU/vgpr-spill.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-spill.mir +++ llvm/test/CodeGen/AMDGPU/vgpr-spill.mir @@ -16,7 +16,7 @@ ; CHECK-LABEL: name: spill_v32 ; CHECK: liveins: $vgpr0 - ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ; 
CHECK: S_NOP 0, implicit $vgpr0 SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) S_NOP 0, implicit $vgpr0 @@ -37,7 +37,7 @@ ; CHECK-LABEL: name: spill_v32_kill ; CHECK: liveins: $vgpr0 - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) ... @@ -56,8 +56,8 @@ ; CHECK-LABEL: name: spill_v64 ; CHECK: liveins: $vgpr0_vgpr1 - ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) - ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) ; CHECK: S_NOP 0, implicit $vgpr0_vgpr1 SI_SPILL_V64_SAVE $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) S_NOP 0, implicit $vgpr0_vgpr1 @@ -78,8 +78,8 @@ ; CHECK-LABEL: name: spill_v64_kill ; CHECK: liveins: $vgpr0_vgpr1 - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 
5) - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... @@ -100,8 +100,8 @@ ; CHECK-LABEL: name: spill_v64_undef_sub1_killed ; CHECK: liveins: $vgpr0 - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... 
@@ -120,8 +120,8 @@ ; CHECK-LABEL: name: spill_v64_undef_sub0_killed ; CHECK: liveins: $vgpr1 - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: (store 4 into %stack.0 + 4, addrspace 5) SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store 8 into %stack.0, addrspace 5) ... 
@@ -140,9 +140,9 @@ ; CHECK-LABEL: name: spill_v128_kill ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5) - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5) - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5) - ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 4, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 8, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store 4 into %stack.0 + 12, addrspace 5) SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store 16 into %stack.0, addrspace 5) ... 
Index: llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -29,7 +29,7 @@ body: | bb.0: ; CHECK-LABEL: name: undef_identity_copy - ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) + ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 ; CHECK: $sgpr4 = COPY $sgpr95 @@ -44,9 +44,9 @@ ; CHECK: $vgpr3 = KILL undef renamable $vgpr3 ; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @bar, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit-def $vgpr0 ; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 - ; CHECK: FLAT_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + ; CHECK: FLAT_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) ; CHECK: S_ENDPGM 0 - %0:vreg_128 = FLAT_LOAD_DWORDX4 undef %1:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) + %0:vreg_128 = FLAT_LOAD_DWORDX4 undef %1:vreg_64, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) 
%2:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95, implicit-def $scc $sgpr4 = COPY $sgpr95 @@ -62,7 +62,7 @@ dead $sgpr30_sgpr31 = SI_CALL %3, @bar, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit-def $vgpr0 %5:vgpr_32 = COPY $vgpr0 ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 - FLAT_STORE_DWORD undef %6:vreg_64, %5, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) + FLAT_STORE_DWORD undef %6:vreg_64, %5, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4, addrspace 1) S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir +++ llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir @@ -11,7 +11,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_write_exec @@ -26,7 +26,7 @@ $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, renamable $vgpr1, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr0, renamable $vgpr1, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec $exec_lo = S_MOV_B32 -1 ... 
# GCN-LABEL: name: vmem_write_sgpr_chain @@ -45,7 +45,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr5 = S_MOV_B32 $sgpr0 $sgpr6 = S_MOV_B32 $sgpr1 $sgpr7 = S_MOV_B32 $sgpr2 @@ -63,7 +63,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 ... # GCN-LABEL: name: vmem_snop_write_sgpr @@ -78,7 +78,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_NOP 0 $sgpr0 = S_MOV_B32 0 ... @@ -93,7 +93,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec $sgpr0 = S_MOV_B32 0 ... 
@@ -108,7 +108,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_WAITCNT 0 $sgpr0 = S_MOV_B32 0 ... @@ -124,7 +124,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_WAITCNT 1 $sgpr0 = S_MOV_B32 0 ... @@ -139,7 +139,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec $exec = S_MOV_B64 7 ... # GCN-LABEL: name: vmem_write_exec_expread @@ -152,7 +152,7 @@ bb.0: $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec $exec = S_MOV_B64 7 ... 
# GCN-LABEL: name: ds_write_m0 @@ -181,7 +181,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.1: $sgpr0 = S_MOV_B32 0 @@ -199,7 +199,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: @@ -219,7 +219,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.2 bb.1: @@ -247,7 +247,7 @@ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_SCC0 %bb.2, implicit $scc S_BRANCH %bb.1 @@ -275,7 +275,7 @@ $sgpr0 = S_MOV_B32 0 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.0 ... 
# GCN-LABEL: name: ds_write_exec @@ -300,7 +300,7 @@ body: | bb.0: $vgpr0 = IMPLICIT_DEF - $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = SCRATCH_LOAD_DWORD undef $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $exec_lo = S_MOV_B32 -1 ... # GCN-LABEL: name: vmem_flat_exec @@ -313,7 +313,7 @@ bb.0: $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr2 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr $exec_lo = S_MOV_B32 -1 ... # GCN-LABEL: name: vmem_global_exec @@ -326,7 +326,7 @@ bb.0: $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec $exec_lo = S_MOV_B32 -1 ... # GCN-LABEL: name: vmem_global_atomic_exec @@ -340,6 +340,6 @@ $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF $vgpr2 = IMPLICIT_DEF - $vgpr3 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, 0, implicit $exec :: (load store syncscope("agent") seq_cst 4, addrspace 1) + $vgpr3 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst 4, addrspace 1) $exec_lo = S_MOV_B32 -1 ... Index: llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir +++ llvm/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir @@ -15,7 +15,7 @@ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: name: vmem_vcc_branch_to_next # GCN: bb.1: @@ -33,7 +33,7 @@ S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_too_far # GCN: bb.1: @@ -54,7 +54,7 @@ $sgpr0 = S_MOV_B32 0 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_nops # GCN: bb.1: @@ -71,7 +71,7 @@ S_NOP 4 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_branch_around # GCN: bb.2: @@ -97,7 +97,7 @@ S_NOP 0 bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: name: vmem_vcc_branch_backedge # GCN: S_NOP 3 @@ -110,7 +110,7 @@ $vgpr0 = IMPLICIT_DEF $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.1: $vgpr0 = IMPLICIT_DEF @@ -139,7 +139,7 @@ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_self_loop # GCN: S_NOP 3 @@ -152,7 +152,7 @@ $vgpr0 = IMPLICIT_DEF $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.0 ... @@ -175,7 +175,7 @@ successors: %bb.1 $sgpr0 = S_MOV_B32 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.1 ... @@ -199,7 +199,7 @@ successors: %bb.1 $sgpr0 = S_MOV_B32 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.1 ... 
Index: llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/waitcnt-agpr.mir @@ -0,0 +1,316 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass si-insert-waitcnts -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4, + <4 x i32> addrspace(1)* %global16, + i32* %flat4, + <4 x i32>* %flat16) { + ret void + } + + define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() { + ret void + } + + define amdgpu_kernel void @single_branch_successor_not_next_block() { + ret void + } + + define amdgpu_kernel void @preexisting_waitcnt() { + ret void + } + + define amdgpu_kernel void @bundle_no_waitcnt() { + ret void + } + + define amdgpu_kernel void @preexisting_waitcnt_in_bundle() { + ret void + } + + define amdgpu_kernel void @insert_in_bundle() { + ret void + } + + define amdgpu_kernel void @exit_bundle() { + ret void + } + + define amdgpu_kernel void @cross_bundle() { + ret void + } + +... 
+--- + + +# Global loads will return in order so we should: +# s_waitcnt vmcnt(1) + +# s_waitcnt vmcnt(0) + +# s_waitcnt vmcnt(0) + +name: flat_zero_waitcnt + +body: | + ; GCN-LABEL: name: flat_zero_waitcnt + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x80000000) + ; GCN: S_WAITCNT 0 + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4, addrspace 1) + ; GCN: $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16, addrspace 1) + ; GCN: S_WAITCNT 3953 + ; GCN: $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + ; GCN: S_BRANCH %bb.1 + ; GCN: bb.1: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: S_WAITCNT 3952 + ; GCN: $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16, addrspace 1) + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GCN: S_BRANCH %bb.2 + ; GCN: bb.2: + ; GCN: S_WAITCNT 49279 + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) + ; GCN: S_WAITCNT 3952 + ; GCN: $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16) + ; GCN: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GCN: S_ENDPGM 0 + bb.0: + successors: %bb.1 + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4) + $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16) + $agpr0 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, 
implicit $flat_scr + $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16) + $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + S_BRANCH %bb.2 + + bb.2: + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) + $agpr3_agpr4_agpr5_agpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16) + $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + S_ENDPGM 0 +... +--- +# There is only a single fallthrough successor block, so there's no +# need to wait immediately. + + +name: single_fallthrough_successor_no_end_block_wait + +body: | + ; GCN-LABEL: name: single_fallthrough_successor_no_end_block_wait + ; GCN: bb.0: + ; GCN: successors: %bb.1(0x80000000) + ; GCN: S_WAITCNT 0 + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: bb.1: + ; GCN: $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec + ; GCN: S_WAITCNT 112 + ; GCN: FLAT_STORE_DWORD $vgpr3_vgpr4, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: S_ENDPGM 0 + bb.0: + successors: %bb.1 + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + + bb.1: + $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec + FLAT_STORE_DWORD $vgpr3_vgpr4, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... +--- +# The block has a single predecessor with a single successor, but it +# is not the next block so it's non-obvious that the wait is not needed. 
+ + + + +name: single_branch_successor_not_next_block + +body: | + ; GCN-LABEL: name: single_branch_successor_not_next_block + ; GCN: bb.0: + ; GCN: successors: %bb.2(0x80000000) + ; GCN: S_WAITCNT 0 + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: S_BRANCH %bb.2 + ; GCN: bb.1: + ; GCN: FLAT_STORE_DWORD $vgpr8_vgpr9, $agpr10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: S_ENDPGM 0 + ; GCN: bb.2: + ; GCN: $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec + ; GCN: S_WAITCNT 112 + ; GCN: FLAT_STORE_DWORD $vgpr3_vgpr4, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: S_ENDPGM 0 + bb.0: + successors: %bb.2 + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + S_BRANCH %bb.2 + + bb.1: + FLAT_STORE_DWORD $vgpr8_vgpr9, $agpr10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 + + bb.2: + $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec + FLAT_STORE_DWORD $vgpr3_vgpr4, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +# GCN-LABEL: name: preexisting_waitcnt{{$}} +# GCN: FLAT_LOAD_DWORD +# GCN-NEXT: S_WAITCNT 0 +# GCN-NOT: S_WAITCNT +name: preexisting_waitcnt +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1_vgpr2 + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + S_WAITCNT 0 + FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + +... 
+ +--- + +name: bundle_no_waitcnt +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1_vgpr2 + ; GCN-LABEL: name: bundle_no_waitcnt + ; GCN: liveins: $vgpr1_vgpr2 + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: BUNDLE { + ; GCN: S_NOP 0 + ; GCN: S_NOP 0 + ; GCN: } + ; GCN: S_WAITCNT 112 + ; GCN: FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + BUNDLE { + S_NOP 0 + S_NOP 0 + } + FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + +... + +--- + +# See the waitcnt inside the bundle and don't insert an extra +name: preexisting_waitcnt_in_bundle +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1_vgpr2 + ; GCN-LABEL: name: preexisting_waitcnt_in_bundle + ; GCN: liveins: $vgpr1_vgpr2 + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: BUNDLE { + ; GCN: S_NOP 0 + ; GCN: S_WAITCNT 0 + ; GCN: } + ; GCN: FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + BUNDLE { + S_NOP 0 + S_WAITCNT 0 + } + FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + +... 
+ +--- + +# Def and use inside bundle + +name: insert_in_bundle +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1_vgpr2 + ; GCN-LABEL: name: insert_in_bundle + ; GCN: liveins: $vgpr1_vgpr2 + ; GCN: BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 { + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: S_WAITCNT 112 + ; GCN: FLAT_STORE_DWORD $vgpr1_vgpr2, internal $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: } + BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 { + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr1_vgpr2, internal $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + } +... + +--- + +# Def is last instruction in bundle, use is outside bundle + + +name: exit_bundle +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1_vgpr2 + ; GCN-LABEL: name: exit_bundle + ; GCN: liveins: $vgpr1_vgpr2 + ; GCN: BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 { + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: } + ; GCN: S_WAITCNT 112 + ; GCN: FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 { + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + } + + FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + +... 
+ +--- + +# Def is in bundle, use is in another bundle + + +name: cross_bundle +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr1_vgpr2 + ; GCN-LABEL: name: cross_bundle + ; GCN: liveins: $vgpr1_vgpr2 + ; GCN: BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 { + ; GCN: $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: } + ; GCN: S_WAITCNT 112 + ; GCN: BUNDLE implicit $agpr0, implicit $vgpr1_vgpr2 { + ; GCN: FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + ; GCN: } + BUNDLE implicit-def $agpr0, implicit $vgpr1_vgpr2 { + $agpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + } + BUNDLE implicit $agpr0, implicit $vgpr1_vgpr2 { + FLAT_STORE_DWORD $vgpr1_vgpr2, $agpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + } +... Index: llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir @@ -13,8 +13,8 @@ $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2 $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2 - $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) + $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 3, killed $sgpr4, implicit $exec 
$vgpr3 = V_CNDMASK_B32_e64 0, -1082130432, 0, 1065353216, killed $sgpr0_sgpr1, implicit $exec $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec @@ -23,7 +23,7 @@ bb.3: successors: %bb.1 - $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) + $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) bb.1: successors: %bb.5, %bb.2 @@ -43,7 +43,7 @@ bb.4: successors: %bb.3, %bb.1 - $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) + $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1) $vgpr4 = V_CVT_I32_F32_e32 $vgpr5, implicit $mode, implicit $exec V_CMP_EQ_U32_e32 2, killed $vgpr4, implicit-def $vcc, implicit $exec $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc Index: llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir @@ -78,7 +78,7 @@ bb.1: successors: %bb.2 - BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2: successors: %bb.3, %bb.6 @@ -86,7 +86,7 @@ bb.3: successors: %bb.4, %bb.5 - BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable 
$vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_VCCNZ %bb.5, implicit $vcc bb.4: @@ -98,6 +98,6 @@ successors: %bb.6 bb.6: - FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... Index: llvm/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir @@ -15,11 +15,11 @@ bb.0: S_BRANCH %bb.1 bb.1: - GLOBAL_STORE_DWORD $vgpr7_vgpr8, $vgpr11, 0, 0, 0, 0, implicit $exec - $vgpr21 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, implicit $exec - $vgpr10 = GLOBAL_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD $vgpr14_vgpr15, $vgpr21, 0, 0, 0, 0, implicit $exec - $vgpr11 = GLOBAL_LOAD_DWORD $vgpr11_vgpr12, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr7_vgpr8, $vgpr11, 0, 0, 0, 0, 0, implicit $exec + $vgpr21 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec + $vgpr10 = GLOBAL_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr14_vgpr15, $vgpr21, 0, 0, 0, 0, 0, implicit $exec + $vgpr11 = GLOBAL_LOAD_DWORD $vgpr11_vgpr12, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_SCC1 %bb.1, implicit $scc bb.2: S_ENDPGM 0 Index: llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-meta-instructions.mir @@ -12,9 +12,9 @@ liveins: $vgpr0_vgpr1 ; GCN-LABEL: name: waitcnt_kill ; GCN: S_WAITCNT 0 - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec ; GCN: KILL $vgpr0 - $vgpr0 = 
GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec KILL $vgpr0 ... @@ -27,9 +27,9 @@ liveins: $vgpr0_vgpr1 ; GCN-LABEL: name: waitcnt_implicit_def ; GCN: S_WAITCNT 0 - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = IMPLICIT_DEF - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = IMPLICIT_DEF ... @@ -42,9 +42,9 @@ liveins: $vgpr0_vgpr1, $vgpr2 ; GCN-LABEL: name: waitcnt_eh_label ; GCN: S_WAITCNT 0 - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec ; GCN: EH_LABEL , implicit $vgpr0 - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec EH_LABEL , implicit $vgpr0 ... @@ -58,9 +58,9 @@ liveins: $vgpr0_vgpr1, $vgpr2 ; GCN-LABEL: name: waitcnt_cfi ; GCN: S_WAITCNT 0 - ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec ; GCN: CFI_INSTRUCTION offset $vgpr0_lo16, 16 - $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec CFI_INSTRUCTION offset $vgpr0, 16 ... 
Index: llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir @@ -17,7 +17,7 @@ bb.1: S_WAITCNT 3952 - FLAT_ATOMIC_CMPSWAP undef renamable $vgpr0_vgpr1, renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_ATOMIC_CMPSWAP undef renamable $vgpr0_vgpr1, renamable $vgpr0_vgpr1, 0, 0, 0, implicit $exec, implicit $flat_scr S_WAITCNT 3952 BUFFER_WBINVL1 implicit $exec S_BRANCH %bb.1 Index: llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9,GFX9_10 %s -# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10,GFX9_10 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10 %s --- | define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void } @@ -21,25 +21,60 @@ ; GFX9-LABEL: name: max-counter-lgkmcnt ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec + ; GFX9: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec + ; GFX9: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec + ; GFX9: $vgpr6_vgpr7 = DS_READ2_B32_gfx9 renamable $vgpr99, 6, 7, 0, implicit $exec + ; GFX9: $vgpr8_vgpr9 = DS_READ2_B32_gfx9 renamable $vgpr99, 8, 
9, 0, implicit $exec + ; GFX9: $vgpr10_vgpr11 = DS_READ2_B32_gfx9 renamable $vgpr99, 10, 11, 0, implicit $exec + ; GFX9: $vgpr12_vgpr13 = DS_READ2_B32_gfx9 renamable $vgpr99, 12, 13, 0, implicit $exec + ; GFX9: $vgpr14_vgpr15 = DS_READ2_B32_gfx9 renamable $vgpr99, 14, 15, 0, implicit $exec + ; GFX9: $vgpr16_vgpr17 = DS_READ2_B32_gfx9 renamable $vgpr99, 16, 17, 0, implicit $exec + ; GFX9: $vgpr18_vgpr19 = DS_READ2_B32_gfx9 renamable $vgpr99, 18, 19, 0, implicit $exec + ; GFX9: $vgpr20_vgpr21 = DS_READ2_B32_gfx9 renamable $vgpr99, 20, 21, 0, implicit $exec + ; GFX9: $vgpr22_vgpr23 = DS_READ2_B32_gfx9 renamable $vgpr99, 22, 23, 0, implicit $exec + ; GFX9: $vgpr24_vgpr25 = DS_READ2_B32_gfx9 renamable $vgpr99, 24, 25, 0, implicit $exec + ; GFX9: $vgpr26_vgpr27 = DS_READ2_B32_gfx9 renamable $vgpr99, 26, 27, 0, implicit $exec + ; GFX9: $vgpr28_vgpr29 = DS_READ2_B32_gfx9 renamable $vgpr99, 28, 29, 0, implicit $exec + ; GFX9: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec + ; GFX9: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec ; GFX9: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec - ; GFX9-NOT: S_WAITCNT 53119 - ; GFX9-NEXT: S_WAITCNT 52863 - ; GFX9-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; GFX9-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec - ; GFX9-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec - ; GFX9-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec - ; GFX9-NEXT: S_ENDPGM 0 + ; GFX9: S_WAITCNT 52863 + ; GFX9: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX9: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX9: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec + ; GFX9: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec + ; GFX9: 
S_ENDPGM 0 ; GFX10-LABEL: name: max-counter-lgkmcnt + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec + ; GFX10: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec + ; GFX10: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec + ; GFX10: $vgpr6_vgpr7 = DS_READ2_B32_gfx9 renamable $vgpr99, 6, 7, 0, implicit $exec + ; GFX10: $vgpr8_vgpr9 = DS_READ2_B32_gfx9 renamable $vgpr99, 8, 9, 0, implicit $exec + ; GFX10: $vgpr10_vgpr11 = DS_READ2_B32_gfx9 renamable $vgpr99, 10, 11, 0, implicit $exec + ; GFX10: $vgpr12_vgpr13 = DS_READ2_B32_gfx9 renamable $vgpr99, 12, 13, 0, implicit $exec + ; GFX10: $vgpr14_vgpr15 = DS_READ2_B32_gfx9 renamable $vgpr99, 14, 15, 0, implicit $exec + ; GFX10: $vgpr16_vgpr17 = DS_READ2_B32_gfx9 renamable $vgpr99, 16, 17, 0, implicit $exec + ; GFX10: $vgpr18_vgpr19 = DS_READ2_B32_gfx9 renamable $vgpr99, 18, 19, 0, implicit $exec + ; GFX10: $vgpr20_vgpr21 = DS_READ2_B32_gfx9 renamable $vgpr99, 20, 21, 0, implicit $exec + ; GFX10: $vgpr22_vgpr23 = DS_READ2_B32_gfx9 renamable $vgpr99, 22, 23, 0, implicit $exec + ; GFX10: $vgpr24_vgpr25 = DS_READ2_B32_gfx9 renamable $vgpr99, 24, 25, 0, implicit $exec + ; GFX10: $vgpr26_vgpr27 = DS_READ2_B32_gfx9 renamable $vgpr99, 26, 27, 0, implicit $exec + ; GFX10: $vgpr28_vgpr29 = DS_READ2_B32_gfx9 renamable $vgpr99, 28, 29, 0, implicit $exec + ; GFX10: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec + ; GFX10: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec ; GFX10: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec - ; GFX10-NEXT: S_WAITCNT 53631 - ; GFX10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; GFX10-NEXT: S_WAITCNT 53375 - ; GFX10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec - ; GFX10-NEXT: 
S_WAITCNT 53119 - ; GFX10-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec - ; GFX10-NEXT: S_WAITCNT 52863 - ; GFX10-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec - ; GFX10-NEXT: S_ENDPGM 0 + ; GFX10: S_WAITCNT 53631 + ; GFX10: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX10: S_WAITCNT 53375 + ; GFX10: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX10: S_WAITCNT 53119 + ; GFX10: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec + ; GFX10: S_WAITCNT 52863 + ; GFX10: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec + ; GFX10: S_ENDPGM 0 $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec @@ -72,84 +107,224 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 - ; GFX9_10-LABEL: name: max-counter-vmcnt - ; GFX9_10: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, implicit $exec - ; GFX9-NOT: S_WAITCNT 53119 - ; GFX10-NOT: S_WAITCNT 65407 - ; GFX9-NEXT: S_WAITCNT 53118 - ; GFX10-NEXT: S_WAITCNT 65406 - ; GFX9_10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec - ; GFX9_10-NEXT: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec - ; GFX9_10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec - ; GFX9_10-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec - ; GFX9_10-NEXT: S_ENDPGM 0 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec - $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit 
$exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, implicit $exec - $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec - $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, implicit $exec - $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, implicit $exec - $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, implicit $exec - $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, implicit $exec - $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, implicit $exec - $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, implicit $exec - $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, implicit $exec - $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, implicit $exec - $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, implicit $exec - $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, implicit $exec - $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, implicit $exec - $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, implicit $exec - $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, implicit $exec - $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, implicit $exec - $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, implicit $exec - $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, implicit $exec - $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, implicit $exec - 
$vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, implicit $exec - $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, 0, 0, implicit $exec - $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, implicit $exec - $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, implicit $exec - $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, implicit $exec - $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, implicit $exec - $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, implicit $exec - $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, implicit $exec - $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, implicit $exec - $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, implicit $exec - $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, implicit $exec - $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, implicit $exec - $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, implicit $exec - $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, implicit $exec - $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, implicit $exec - $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, implicit $exec - $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, implicit $exec - $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, implicit $exec - $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, 
implicit $exec - $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, implicit $exec - $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, implicit $exec - $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, implicit $exec - $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, implicit $exec - $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, implicit $exec - $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, implicit $exec - $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, implicit $exec - $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, implicit $exec - $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, implicit $exec - $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, implicit $exec - $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, implicit $exec - $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, implicit $exec - $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, implicit $exec - $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, implicit $exec - $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, implicit $exec - $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, implicit $exec - $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, implicit $exec - $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, implicit $exec - $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 
0, 0, 0, 0, 0, implicit $exec - $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, implicit $exec - $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, implicit $exec - $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, implicit $exec - $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, implicit $exec - $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, implicit $exec - $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, implicit $exec - $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, implicit $exec + ; GFX9-LABEL: name: max-counter-vmcnt + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr10 
= BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr27 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr44 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr61 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_WAITCNT 53118 + ; GFX9: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX9: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec + ; GFX9: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX9: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec + ; GFX9: S_ENDPGM 0 + ; GFX10-LABEL: name: max-counter-vmcnt + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr7 
= BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr24 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: 
$vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, 0, implicit $exec + ; 
GFX10: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX10: S_WAITCNT 65406 + ; GFX10: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX10: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec + ; GFX10: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX10: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec + ; GFX10: S_ENDPGM 0 + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr5 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 
92, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, 0, 0, 0, implicit $exec + 
$vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr49 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr60 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec @@ -164,9 +339,31 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0, $vgpr1 - ; GFX9_10-LABEL: name: max-counter-expcnt - ; GFX9_10: EXP - ; GFX9_10-NOT: S_WAITCNT + ; GFX9-LABEL: name: max-counter-expcnt + ; GFX9: S_WAITCNT 0 + ; GFX9: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX9: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX9: S_ENDPGM 0 + ; GFX10-LABEL: name: max-counter-expcnt + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef 
$sgpr_null, 0 + ; GFX10: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX10: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX10: S_ENDPGM 0 EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec Index: llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir @@ -32,6 +32,6 @@ $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec - IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) + IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 16) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir @@ -11,10 +11,10 @@ ; GFX9-LABEL: name: buffer_buffer ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 ; GFX9: S_WAITCNT 0 - ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... # Two tbuffer loads with overlapping outputs. No waitcnt required. 
@@ -27,10 +27,10 @@ ; GFX9-LABEL: name: tbuffer_tbuffer ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 ; GFX9: S_WAITCNT 0 - ; GFX9: $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec - ; GFX9: $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec - $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec - $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, 0, implicit $exec ... # Two gathers with overlapping outputs. 
(Note gathers can't be trimmed because @@ -44,10 +44,10 @@ ; GFX9-LABEL: name: gather_gather ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; GFX9: S_WAITCNT 0 - ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) - $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) ... # Image load vs image sample. 
Waitcnt required because they are not guaranteed @@ -62,9 +62,9 @@ ; GFX9-LABEL: name: nosampler_sampler ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9: S_WAITCNT 0 - ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) ; GFX9: S_WAITCNT 3952 - ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec - $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) - $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 16) + ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 16) + $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 16) + $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec :: (load 16) ... 
Index: llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.mir @@ -12,6 +12,6 @@ liveins: $sgpr0_sgpr1 $sgpr4 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 4, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) S_WAITCNT_VSCNT undef $sgpr_null, 0 - $vgpr0 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, 0, implicit $exec :: (load store syncscope("agent") seq_cst 4, addrspace 1) + $vgpr0 = GLOBAL_ATOMIC_ADD_RTN $vgpr0_vgpr1, $vgpr2, 0, 1, 0, 0, implicit $exec :: (load store syncscope("agent") seq_cst 4, addrspace 1) S_CMP_LG_U32 killed $sgpr4, 0, implicit-def $scc ... Index: llvm/test/CodeGen/AMDGPU/waitcnt.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/waitcnt.mir +++ llvm/test/CodeGen/AMDGPU/waitcnt.mir @@ -87,34 +87,34 @@ body: | bb.0: successors: %bb.1 - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4) - $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16) + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4) + $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16) $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec S_BRANCH %bb.1 bb.1: successors: %bb.2 - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.global16) + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, 
implicit $flat_scr :: (load 16 from %ir.global16) $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec S_BRANCH %bb.2 bb.2: successors: %bb.3 - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) - $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16) + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) + $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16) $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec S_BRANCH %bb.3 bb.3: successors: %bb.4 - $vgpr3 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) - $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4) + $vgpr3 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) + $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4) $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec S_BRANCH %bb.4 bb.4: - $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) + $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) $vgpr0 = V_MOV_B32_e32 $vgpr5, implicit $exec S_ENDPGM 0 ... 
@@ -135,11 +135,11 @@ body: | bb.0: successors: %bb.1 - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr bb.1: $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec - FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... --- @@ -162,16 +162,16 @@ body: | bb.0: successors: %bb.2 - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_BRANCH %bb.2 bb.1: - FLAT_STORE_DWORD $vgpr8_vgpr9, $vgpr10, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr8_vgpr9, $vgpr10, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 bb.2: $vgpr3_vgpr4 = V_LSHLREV_B64_e64 4, $vgpr7_vgpr8, implicit $exec - FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr3_vgpr4, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... @@ -186,9 +186,9 @@ body: | bb.0: liveins: $vgpr1_vgpr2 - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_WAITCNT 0 - FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... 
@@ -208,12 +208,12 @@ body: | bb.0: liveins: $vgpr1_vgpr2 - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr BUNDLE { S_NOP 0 S_NOP 0 } - FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... @@ -231,12 +231,12 @@ body: | bb.0: liveins: $vgpr1_vgpr2 - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr BUNDLE { S_NOP 0 S_WAITCNT 0 } - FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... @@ -245,9 +245,9 @@ # Def and use inside bundle # CHECK-LABEL: name: insert_in_bundle{{$}} # CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 { -# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr +# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr # CHECK-NEXT: S_WAITCNT 112 -# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr +# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr # CHECK-NEXT: } name: insert_in_bundle @@ -258,8 +258,8 @@ bb.0: liveins: $vgpr1_vgpr2 BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 { - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr1_vgpr2, internal $vgpr0, 0, 0, 0, 0, 0, implicit 
$exec, implicit $flat_scr } ... @@ -269,10 +269,10 @@ # CHECK-LABEL: name: exit_bundle{{$}} # CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 { -# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr +# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr # CHECK-NEXT: } # CHECK-NEXT: S_WAITCNT 112 -# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr +# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr name: exit_bundle tracksRegLiveness: true @@ -282,10 +282,10 @@ bb.0: liveins: $vgpr1_vgpr2 BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 { - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr } - FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ... 
@@ -295,11 +295,11 @@ # CHECK-LABEL: name: cross_bundle{{$}} # CHECK: BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 { -# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr +# CHECK-NEXT: $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr # CHECK-NEXT: } # CHECK-NEXT: S_WAITCNT 112 # CHECK-NEXT: BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 { -# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr +# CHECK-NEXT: FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr # CHECK-NEXT: } name: cross_bundle @@ -310,10 +310,10 @@ bb.0: liveins: $vgpr1_vgpr2 BUNDLE implicit-def $vgpr0, implicit $vgpr1_vgpr2 { - $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr } BUNDLE implicit $vgpr0, implicit $vgpr1_vgpr2 { - FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr } ... @@ -328,7 +328,7 @@ body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4 - $vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr V_NOP_e32 implicit $exec, implicit $vgpr0_lo16, implicit $vgpr1_lo16 ... 
Index: llvm/test/CodeGen/AMDGPU/wqm.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/wqm.mir +++ llvm/test/CodeGen/AMDGPU/wqm.mir @@ -73,7 +73,7 @@ bb.1: S_CMP_LT_I32 0, %0:sgpr_32, implicit-def $scc - %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %3:vgpr_32, %13:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %3:vgpr_32, %13:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec %12:vgpr_32 = V_ADD_CO_U32_e32 %3:vgpr_32, %3:vgpr_32, implicit-def $vcc, implicit $exec %5:sgpr_32 = S_CSELECT_B32 %2:sgpr_32, %1:sgpr_32, implicit $scc %11:vgpr_32 = V_ADD_CO_U32_e32 %5:sgpr_32, %12:vgpr_32, implicit-def $vcc, implicit $exec @@ -130,14 +130,14 @@ %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 %5:sgpr_128 = COPY %6 %7:sreg_32 = S_MOV_B32 0 - %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, 0, 0, 0, implicit $exec + %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, 0, 0, 0, 0, implicit $exec %16:vgpr_32 = COPY %8.sub1 %11:vgpr_32 = COPY %16 %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc %14:vgpr_32 = COPY %7 %13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec early-clobber %15:vgpr_32 = WWM killed %13, implicit $exec - BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -162,7 +162,7 @@ %0:sgpr_32 = COPY $sgpr0 %4:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 %5:sreg_32 = S_MOV_B32 0 - %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %4, %5, 0, 0, 0, 0, 0, 0, implicit $exec + %6:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %4, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec %8:sreg_64 = COPY $exec %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -172,7 +172,7 @@ early-clobber %13:sreg_32 = WWM %9:vgpr_32, implicit $exec %14:vgpr_32 = COPY %13 - BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -210,20 +210,20 @@ undef %7.sub0:vreg_64 = COPY %2:vgpr_32 %7.sub1:vreg_64 = COPY %3:vgpr_32 - %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %7:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) + %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %7:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) S_CMP_EQ_U32 %8:sgpr_32, 0, implicit-def $scc undef %5.sub0:vreg_64 = nsz arcp nofpexcept V_ADD_F32_e64 0, %4.sub0:vreg_128, 0, %3:vgpr_32, 1, 0, implicit $mode, implicit $exec %5.sub1:vreg_64 = nsz arcp nofpexcept V_MUL_F32_e32 %2, %3, implicit $mode, implicit $exec %6:vgpr_32 = nsz arcp nofpexcept V_ADD_F32_e64 0, %2:vgpr_32, 0, %3:vgpr_32, 1, 0, implicit $mode, implicit $exec - %9:vreg_128 = IMAGE_SAMPLE_V4_V2 %5:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) + %9:vreg_128 = IMAGE_SAMPLE_V4_V2 %5:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 4, addrspace 4) S_CBRANCH_SCC0 %bb.2, implicit $scc bb.1: %10:sreg_32 = S_MOV_B32 0 - BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %101:sgpr_128, 
%10:sreg_32, 4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %101:sgpr_128, %10:sreg_32, 4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 bb.2: Index: llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll =================================================================== --- llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll +++ llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll @@ -4,7 +4,7 @@ ; Test that custom pseudo source values can be round trip serialized through MIR. ; CHECK-LABEL: {{^}}name: shader -; CHECK: %[[#]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed %17, %18, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4, align 1, addrspace 4) +; CHECK: %[[#]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed %17, %18, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource" + 4, align 1, addrspace 4) ; CHECK: IMAGE_STORE_V4_V3_nsa_gfx10 killed %[[#]], %[[#]], %[[#]], %[[#]], killed %[[#]], 15, 2, -1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "ImageResource") ; CHECK: DS_GWS_BARRIER %[[#]], 63, implicit $m0, implicit $exec :: (load 4 from custom "GWSResource") define amdgpu_cs void @shader(i32 %arg0, i32 %arg1, <8 x i32> inreg %arg2, <4 x i32> inreg %arg3) { Index: llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir =================================================================== --- llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir +++ llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir @@ -32,7 +32,7 @@ } ... 
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test1 liveins: @@ -56,14 +56,14 @@ %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) --- name: test2 liveins: @@ -87,14 +87,14 @@ %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 0, 
implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) --- name: test3 liveins: @@ -118,13 +118,13 @@ %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... 
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test4 liveins: @@ -148,8 +148,8 @@ %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... Index: llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-scc.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/MIR/AMDGPU/load-store-opt-scc.mir @@ -0,0 +1,155 @@ +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=si-load-store-opt -verify-machineinstrs -o - %s | FileCheck %s + +# The purpose of this test is to make sure we are combining relevant memory +# operations correctly with/without SCC bit. 
+ +--- | + define amdgpu_kernel void @test1(i32 addrspace(1)* %out) { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void + } + + define amdgpu_kernel void @test2(i32 addrspace(1)* %out) { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void + } + + define amdgpu_kernel void @test3(i32 addrspace(1)* %out) { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void + } + define amdgpu_kernel void @test4(i32 addrspace(1)* %out) { + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 + store i32 123, i32 addrspace(1)* %out.gep.1 + store i32 456, i32 addrspace(1)* %out + ret void + } +... + +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +--- +name: test1 +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '' } +body: | + bb.0 (%ir-block.0): + liveins: $sgpr0_sgpr1 + + $vgpr0 = V_MOV_B32_e32 123, implicit $exec + $vgpr1 = V_MOV_B32_e32 456, implicit $exec + + $sgpr2 = S_MOV_B32 -1 + $sgpr3 = S_MOV_B32 61440 + + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sgpr_64 = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_128 = REG_SEQUENCE %1, %2, %3 + + %5:vgpr_32 = COPY $vgpr0 + %6:vgpr_32 = COPY $vgpr1 + + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + + S_ENDPGM 0 +... 
+ +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +--- +name: test2 +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '' } +body: | + bb.0 (%ir-block.0): + liveins: $sgpr0_sgpr1 + + $vgpr0 = V_MOV_B32_e32 123, implicit $exec + $vgpr1 = V_MOV_B32_e32 456, implicit $exec + + $sgpr2 = S_MOV_B32 -1 + $sgpr3 = S_MOV_B32 61440 + + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sgpr_64 = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_128 = REG_SEQUENCE %1, %2, %3 + + %5:vgpr_32 = COPY $vgpr0 + %6:vgpr_32 = COPY $vgpr1 + + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + + S_ENDPGM 0 +... 
+ +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +--- +name: test3 +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '' } +body: | + bb.0 (%ir-block.0): + liveins: $sgpr0_sgpr1 + + $vgpr0 = V_MOV_B32_e32 123, implicit $exec + $vgpr1 = V_MOV_B32_e32 456, implicit $exec + + $sgpr2 = S_MOV_B32 -1 + $sgpr3 = S_MOV_B32 61440 + + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sgpr_64 = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_128 = REG_SEQUENCE %1, %2, %3 + + %5:vgpr_32 = COPY $vgpr0 + %6:vgpr_32 = COPY $vgpr1 + + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + + S_ENDPGM 0 +... 
+ +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +--- +name: test4 +liveins: + - { reg: '$sgpr0_sgpr1', virtual-reg: '' } +body: | + bb.0 (%ir-block.0): + liveins: $sgpr0_sgpr1 + + $vgpr0 = V_MOV_B32_e32 123, implicit $exec + $vgpr1 = V_MOV_B32_e32 456, implicit $exec + + $sgpr2 = S_MOV_B32 -1 + $sgpr3 = S_MOV_B32 61440 + + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sgpr_64 = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_128 = REG_SEQUENCE %1, %2, %3 + + %5:vgpr_32 = COPY $vgpr0 + %6:vgpr_32 = COPY $vgpr1 + + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + + S_ENDPGM 0 +... 
Index: llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir =================================================================== --- llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir +++ llvm/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir @@ -19,7 +19,7 @@ ; CHECK: %bb0_{{[0-9]+}}__2:vgpr_32 = COPY %bb0_{{[0-9]+}}__1 ; CHECK: %bb0_{{[0-9]+}}__1:vreg_64 = REG_SEQUENCE %bb0_{{[0-9]+}}__1, %subreg.sub0, %bb0_{{[0-9]+}}__1, %subreg.sub1 ; CHECK: %bb0_{{[0-9]+}}__1:sgpr_128 = REG_SEQUENCE %bb0_{{[0-9]+}}__1, %subreg.sub0, %bb0_{{[0-9]+}}__1, %subreg.sub1, %bb0_{{[0-9]+}}__1, %subreg.sub2, %bb0_{{[0-9]+}}__2, %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_ADDR64 %bb0_{{[0-9]+}}__1, %bb0_{{[0-9]+}}__1, %bb0_{{[0-9]+}}__1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_ADDR64 %bb0_{{[0-9]+}}__1, %bb0_{{[0-9]+}}__1, %bb0_{{[0-9]+}}__1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; CHECK: S_ENDPGM 0 %10:sreg_32_xm0 = S_MOV_B32 61440 %11:sreg_32_xm0 = S_MOV_B32 0 @@ -35,7 +35,7 @@ %vreg123_3:vgpr_32 = COPY %5 %16:sgpr_128 = REG_SEQUENCE killed %vreg123_0, %subreg.sub0, %vreg123_1, %subreg.sub1, %vreg123_2, %subreg.sub2, %vreg123_3, %subreg.sub3 - BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir =================================================================== --- llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir +++ llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir @@ -12,7 +12,7 @@ # CHECK: isEntryFunction: true # CHECK: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' # CHECK: frameOffsetReg: '$sgpr50' -# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) +# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) name: reserve_correct_register tracksRegLiveness: true machineFunctionInfo: @@ -24,6 +24,6 @@ body: | bb.0: - renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) S_ENDPGM 0 ... 
Index: llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir =================================================================== --- llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir +++ llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir @@ -42,9 +42,9 @@ !0 = !{i32 1} # GCN-LABEL: name: syncscopes -# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("agent") seq_cst 4 into %ir.agent_out, addrspace 4) -# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out, addrspace 4) -# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out, addrspace 4) +# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("agent") seq_cst 4 into %ir.agent_out, addrspace 4) +# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out, addrspace 4) +# GCN: FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out, addrspace 4) ... 
--- name: syncscopes @@ -84,17 +84,17 @@ $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed $sgpr4_sgpr5, 40, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit $sgpr0_sgpr1, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr6, implicit $exec, implicit $exec - FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("agent") seq_cst 4 into %ir.agent_out) + FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("agent") seq_cst 4 into %ir.agent_out) S_WAITCNT 112 $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $sgpr2_sgpr3, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit $exec - FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) + FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) S_WAITCNT 112 $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr4_sgpr5 $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr4_sgpr5, implicit $sgpr4_sgpr5, implicit $exec $vgpr2 = V_MOV_B32_e32 killed $sgpr8, implicit $exec, implicit $exec - FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) + FLAT_STORE_DWORD killed $vgpr0_vgpr1, killed $vgpr2, 0, -1, 0, 0, 
0, implicit $exec, implicit $flat_scr :: (volatile non-temporal store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) S_ENDPGM 0 ... Index: llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir =================================================================== --- llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir +++ llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir @@ -52,7 +52,7 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -82,6 +82,6 @@ $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
Index: llvm/test/MC/AMDGPU/atomic-fadd-insts.s =================================================================== --- llvm/test/MC/AMDGPU/atomic-fadd-insts.s +++ llvm/test/MC/AMDGPU/atomic-fadd-insts.s @@ -41,7 +41,7 @@ // GFX908: encoding: [0x07,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:4095 glc -// GFX908-ERR: error: invalid operand for instruction +// GFX908-ERR: error: operands are not valid for this GPU or mode buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:4095 slc // GFX908: encoding: [0xff,0x0f,0x36,0xe1,0x00,0x05,0x02,0x03] @@ -86,7 +86,7 @@ // GFX908: encoding: [0x07,0x00,0x38,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:4095 glc -// GFX908-ERR: error: invalid operand for instruction +// GFX908-ERR: error: operands are not valid for this GPU or mode buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:4095 slc // GFX908: encoding: [0xff,0x0f,0x3a,0xe1,0x00,0x05,0x02,0x03] Index: llvm/test/MC/AMDGPU/dpp64.s =================================================================== --- /dev/null +++ llvm/test/MC/AMDGPU/dpp64.s @@ -0,0 +1,58 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck %s --check-prefix=GFX90A +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck %s --check-prefix=GFX900 --implicit-check-not=error: + +// GFX90A: v_ceil_f64_dpp v[0:1], v[2:3] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x30,0x00,0x7e,0x02,0x51,0x01,0xff] +// GFX900: error: not a valid operand. 
+v_ceil_f64 v[0:1], v[2:3] row_newbcast:1 + +// GFX90A: v_fmac_f64_dpp v[0:1], v[2:3], v[4:5] row_newbcast:2 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x08,0x00,0x08,0x02,0x52,0x01,0xff] +// GFX900: error: instruction not supported on this GPU +v_fmac_f64 v[0:1], v[2:3], v[4:5] row_newbcast:2 + +// GFX90A: v_cvt_f32_f64_dpp v5, v[0:1] row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x1e,0x0a,0x7e,0x00,0x5f,0x01,0xff] +// GFX900: error: not a valid operand. +v_cvt_f32_f64 v5, v[0:1] row_newbcast:15 + +// GFX90A: v_cvt_i32_f64_dpp v5, v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x06,0x0a,0x7e,0x00,0x51,0x01,0xff] +// GFX900: error: not a valid operand. +v_cvt_i32_f64 v5, v[0:1] row_newbcast:1 + +// GFX90A: v_cvt_u32_f64_dpp v5, v[0:1] row_newbcast:2 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x2a,0x0a,0x7e,0x00,0x52,0x01,0xff] +// GFX900: error: not a valid operand. +v_cvt_u32_f64 v5, v[0:1] row_newbcast:2 + +// GFX90A: v_floor_f64_dpp v[4:5], v[0:1] row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x34,0x08,0x7e,0x00,0x5f,0x01,0xff] +// GFX900: error: not a valid operand. +v_floor_f64 v[4:5], v[0:1] row_newbcast:15 + +// GFX90A: v_fract_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x64,0x08,0x7e,0x00,0x51,0x01,0xff] +// GFX900: error: not a valid operand. +v_fract_f64 v[4:5], v[0:1] row_newbcast:1 + +// GFX90A: v_frexp_exp_i32_f64_dpp v5, v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x60,0x0a,0x7e,0x00,0x51,0x01,0xff] +// GFX900: error: not a valid operand. +v_frexp_exp_i32_f64 v5, v[0:1] row_newbcast:1 + +// GFX90A: v_frexp_mant_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x62,0x08,0x7e,0x00,0x51,0x01,0xff] +// GFX900: error: not a valid operand. 
+v_frexp_mant_f64 v[4:5], v[0:1] row_newbcast:1 + +// GFX90A: v_rcp_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4a,0x08,0x7e,0x00,0x51,0x01,0xff] +// GFX900: error: not a valid operand. +v_rcp_f64 v[4:5], v[0:1] row_newbcast:1 + +// GFX90A: v_rndne_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x32,0x08,0x7e,0x00,0x51,0x01,0xff] +// GFX900: error: not a valid operand. +v_rndne_f64 v[4:5], v[0:1] row_newbcast:1 + +// GFX90A: v_rsq_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4c,0x08,0x7e,0x00,0x51,0x01,0xff] +// GFX900: error: not a valid operand. +v_rsq_f64 v[4:5], v[0:1] row_newbcast:1 + +// GFX90A: v_sqrt_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x50,0x08,0x7e,0x00,0x51,0x01,0xff] +// GFX900: error: not a valid operand. +v_sqrt_f64 v[4:5], v[0:1] row_newbcast:1 + +// GFX90A: v_trunc_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x2e,0x08,0x7e,0x00,0x51,0x01,0xff] +// GFX900: error: not a valid operand. 
+v_trunc_f64 v[4:5], v[0:1] row_newbcast:1 Index: llvm/test/MC/AMDGPU/gfx90a_asm_features.s =================================================================== --- /dev/null +++ llvm/test/MC/AMDGPU/gfx90a_asm_features.s @@ -0,0 +1,1030 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX908,NOT-GFX90A --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX1010,NOT-GFX90A --implicit-check-not=error: %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=GFX90A %s + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0xb0,0xd3,0x00,0x01,0x10,0x04] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel_hi:[0,0,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0xb0,0xd3,0x00,0x01,0x10,0x04] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] 
op_sel:[0,0,1] op_sel_hi:[0,0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0xfc] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0x1c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,1,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0xfc] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x3c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,0,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x5c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x9c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0xb0,0xd3,0x00,0x01,0x10,0x1c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,0,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0xb0,0xd3,0x00,0x01,0x10,0x1c] 
+v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0xb0,0xd3,0x00,0x01,0x10,0x1c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] clamp ; encoding: [0x08,0xc0,0xb0,0xd3,0x00,0x01,0x10,0x1c] +v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] clamp + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17] ; encoding: [0x00,0x40,0xb0,0xd3,0x04,0x11,0x42,0x1c] +v_pk_fma_f32 v[0:1], v[4:5], v[8:9], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[254:255], v[8:9], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0xfe,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[254:255], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], s[2:3], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x02,0x20,0x02,0x18] +v_pk_mul_f32 v[4:5], s[2:3], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], s[100:101], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x64,0x20,0x02,0x18] +v_pk_mul_f32 v[4:5], s[100:101], v[16:17] + +// GFX90A: v_pk_mul_f32 v[4:5], flat_scratch, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x66,0x20,0x02,0x18] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: instruction not supported on this GPU +v_pk_mul_f32 v[4:5], flat_scratch, v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], vcc, v[16:17] ; 
encoding: [0x04,0x00,0xb1,0xd3,0x6a,0x20,0x02,0x18] +v_pk_mul_f32 v[4:5], vcc, v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x7e,0x20,0x02,0x18] +v_pk_mul_f32 v[4:5], exec, v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xfd,0x03,0x18] +v_pk_mul_f32 v[4:5], v[8:9], v[254:255] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], s[2:3] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x05,0x00,0x18] +v_pk_mul_f32 v[4:5], v[8:9], s[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], s[100:101] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xc9,0x00,0x18] +v_pk_mul_f32 v[4:5], v[8:9], s[100:101] + +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], flat_scratch ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xcd,0x00,0x18] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: instruction not supported on this GPU +v_pk_mul_f32 v[4:5], v[8:9], flat_scratch + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xd5,0x00,0x18] +v_pk_mul_f32 v[4:5], v[8:9], vcc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xfd,0x00,0x18] +v_pk_mul_f32 v[4:5], v[8:9], exec + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x08,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[8:9], 
v[16:17] op_sel:[1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x10,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x18,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x00] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x08] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x10] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x38] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x58] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; 
encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x78] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x01,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x02,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x03,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0x80,0xb1,0xd3,0x08,0x21,0x02,0x18] +v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[254:255], v[8:9], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0xfe,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[254:255], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], s[2:3], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x02,0x20,0x02,0x18] +v_pk_add_f32 v[4:5], s[2:3], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], s[100:101], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x64,0x20,0x02,0x18] +v_pk_add_f32 v[4:5], s[100:101], v[16:17] + +// GFX90A: v_pk_add_f32 v[4:5], flat_scratch, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x66,0x20,0x02,0x18] +// NOT-GFX1010: 
error: instruction not supported on this GPU +// NOT-GFX908: error: instruction not supported on this GPU +v_pk_add_f32 v[4:5], flat_scratch, v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x6a,0x20,0x02,0x18] +v_pk_add_f32 v[4:5], vcc, v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x7e,0x20,0x02,0x18] +v_pk_add_f32 v[4:5], exec, v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xfd,0x03,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[254:255] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], s[2:3] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x05,0x00,0x18] +v_pk_add_f32 v[4:5], v[8:9], s[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], s[100:101] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xc9,0x00,0x18] +v_pk_add_f32 v[4:5], v[8:9], s[100:101] + +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], flat_scratch ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xcd,0x00,0x18] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: instruction not supported on this GPU +v_pk_add_f32 v[4:5], v[8:9], flat_scratch + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xd5,0x00,0x18] +v_pk_add_f32 v[4:5], v[8:9], vcc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xfd,0x00,0x18] +v_pk_add_f32 v[4:5], v[8:9], exec + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: 
[0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x08,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x10,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x18,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x00] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x08] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x10] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x38] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// 
GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x58] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x78] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x01,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x02,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x03,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0x80,0xb2,0xd3,0x08,0x21,0x02,0x18] +v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x18] +v_pk_mov_b32 v[0:1], v[2:3], v[4:5] + +// GFX90A: v_pk_mov_b32 v[0:1], flat_scratch, v[4:5] ; encoding: [0x00,0x00,0xb3,0xd3,0x66,0x08,0x02,0x18] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: instruction not supported on this GPU +v_pk_mov_b32 v[0:1], flat_scratch, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], vcc ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0xd5,0x00,0x18] +v_pk_mov_b32 v[0:1], v[2:3], vcc + +// 
NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], s[0:1] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x01,0x00,0x18] +v_pk_mov_b32 v[0:1], v[2:3], s[0:1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel_hi:[0,1] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x05,0x02,0x10] +v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel_hi:[0,1] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,0] ; encoding: [0x00,0x08,0xb3,0xd3,0x02,0x09,0x02,0x18] +v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,0] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,1] ; encoding: [0x00,0x18,0xb3,0xd3,0x02,0x09,0x02,0x18] +v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,1] + +// GFX90A: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 scc ; encoding: [0x00,0x80,0x09,0xe8,0x00,0x04,0x20,0x80] +// NOT-GFX1010: error: not a valid operand. +// NOT-GFX908: error: failed parsing operand. +tbuffer_load_format_xyzw v[4:7], off, s[0:3], dfmt:1, nfmt:0, 0 scc + +// GFX90A: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc scc ; encoding: [0x00,0xc0,0x09,0xe8,0x00,0x04,0x20,0x80] +// NOT-GFX1010: error: not a valid operand. +// NOT-GFX908: error: failed parsing operand. 
+tbuffer_load_format_xyzw v[4:7], off, s[0:3], dfmt:1, nfmt:0, 0 glc scc + +// NOT-GFX90A: error: failed parsing operand +// GFX90A: buffer_load_dword v5, off, s[8:11], s3 offset:4095 scc ; encoding: [0xff,0x8f,0x50,0xe0,0x00,0x05,0x02,0x03] +buffer_load_dword v5, off, s[8:11], s3 offset:4095 scc + +// NOT-GFX90A: error: failed parsing operand +// GFX90A: buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc scc ; encoding: [0xff,0xcf,0x50,0xe0,0x00,0x05,0x02,0x03] +buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc scc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_wbl2 ; encoding: [0x00,0x00,0xa0,0xe0,0x00,0x00,0x00,0x00] +buffer_wbl2 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_invl2 ; encoding: [0x00,0x00,0xa4,0xe0,0x00,0x00,0x00,0x00] +buffer_invl2 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x03,0x03] +buffer_atomic_add_f64 v[4:5], off, s[12:15], s3 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x18,0x03] +buffer_atomic_add_f64 v[4:5], off, s[96:99], s3 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x65] +buffer_atomic_add_f64 v[4:5], off, s[8:11], s101 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: 
[0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x7c] +buffer_atomic_add_f64 v[4:5], off, s[8:11], m0 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x80] +buffer_atomic_add_f64 v[4:5], off, s[8:11], 0 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0xc1] +buffer_atomic_add_f64 v[4:5], off, s[8:11], -1 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x3c,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x3c,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:7 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xe1,0x00,0x04,0x02,0x03] 
+buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 slc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x03,0x03] +buffer_atomic_min_f64 v[4:5], off, s[12:15], s3 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x18,0x03] +buffer_atomic_min_f64 v[4:5], off, s[96:99], s3 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x65] +buffer_atomic_min_f64 v[4:5], off, s[8:11], s101 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x7c] +buffer_atomic_min_f64 v[4:5], off, s[8:11], m0 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x80] +buffer_atomic_min_f64 v[4:5], off, s[8:11], 0 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0xc1] +buffer_atomic_min_f64 v[4:5], off, s[8:11], -1 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x40,0xe1,0x00,0x04,0x02,0x03] 
+buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x40,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x40,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:7 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 slc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x03,0x03] +buffer_atomic_max_f64 v[4:5], off, s[12:15], s3 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x18,0x03] +buffer_atomic_max_f64 v[4:5], off, 
s[96:99], s3 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x65] +buffer_atomic_max_f64 v[4:5], off, s[8:11], s101 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x7c] +buffer_atomic_max_f64 v[4:5], off, s[8:11], m0 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x80] +buffer_atomic_max_f64 v[4:5], off, s[8:11], 0 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0xc1] +buffer_atomic_max_f64 v[4:5], off, s[8:11], -1 offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x44,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x44,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 + +// 
NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x44,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:7 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 slc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_f64 v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0x01,0x02,0x00,0x00] +ds_add_f64 v1, v[2:3] offset:65535 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_f64 v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0xff,0x02,0x00,0x00] +ds_add_f64 v255, v[2:3] offset:65535 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_f64 v1, v[254:255] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0x01,0xfe,0x00,0x00] +ds_add_f64 v1, v[254:255] offset:65535 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_f64 v1, v[2:3] ; encoding: [0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00] +ds_add_f64 v1, v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_f64 v1, v[2:3] ; encoding: [0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00] +ds_add_f64 v1, v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_f64 v1, v[2:3] offset:4 ; encoding: [0x04,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00] +ds_add_f64 v1, v[2:3] offset:4 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_f64 v1, v[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xb9,0xd8,0x01,0x02,0x00,0x00] +ds_add_f64 v1, v[2:3] offset:65535 gds + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 ; encoding: 
[0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0x04] +ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_rtn_f64 v[254:255], v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0xfe] +ds_add_rtn_f64 v[254:255], v1, v[2:3] offset:65535 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_rtn_f64 v[4:5], v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0xff,0x02,0x00,0x04] +ds_add_rtn_f64 v[4:5], v255, v[2:3] offset:65535 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[254:255] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0xfe,0x00,0x04] +ds_add_rtn_f64 v[4:5], v1, v[254:255] offset:65535 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] ; encoding: [0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04] +ds_add_rtn_f64 v[4:5], v1, v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] ; encoding: [0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04] +ds_add_rtn_f64 v[4:5], v1, v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:4 ; encoding: [0x04,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04] +ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:4 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xf9,0xd8,0x01,0x02,0x00,0x04] +ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 gds + +// NOT-GFX90A: error: failed parsing operand +// GFX90A: flat_load_dword v0, v[0:1] scc ; encoding: [0x00,0x00,0x50,0xde,0x00,0x00,0x00,0x00] +flat_load_dword v0, v[0:1] scc + +// NOT-GFX90A: error: failed parsing operand +// GFX90A: flat_load_dword v0, v[0:1] glc scc ; encoding: [0x00,0x00,0x51,0xde,0x00,0x00,0x00,0x00] +flat_load_dword v0, v[0:1] glc scc + +// 
NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_add_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0xfe,0x02,0x00,0x00] +flat_atomic_add_f64 v[254:255], v[2:3] offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_add_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0x00,0xfe,0x00,0x00] +flat_atomic_add_f64 v[0:1], v[254:255] offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_add_f64 v[0:1], v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_add_f64 v[0:1], v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_add_f64 v[0:1], v[2:3] offset:7 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x3d,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc scc ; encoding: [0xff,0x0f,0x3d,0xdf,0x00,0x02,0x00,0x00] +flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc scc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xdd,0x00,0x02,0x00,0x00] 
+flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 slc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_min_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0xfe,0x02,0x00,0x00] +flat_atomic_min_f64 v[254:255], v[2:3] offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_min_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0x00,0xfe,0x00,0x00] +flat_atomic_min_f64 v[0:1], v[254:255] offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_min_f64 v[0:1], v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_min_f64 v[0:1], v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x40,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_min_f64 v[0:1], v[2:3] offset:7 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x41,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 slc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 ; encoding: 
[0xff,0x0f,0x44,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_max_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0xfe,0x02,0x00,0x00] +flat_atomic_max_f64 v[254:255], v[2:3] offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_max_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0x00,0xfe,0x00,0x00] +flat_atomic_max_f64 v[0:1], v[254:255] offset:4095 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_max_f64 v[0:1], v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_max_f64 v[0:1], v[2:3] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x44,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_max_f64 v[0:1], v[2:3] offset:7 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x45,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdd,0x00,0x02,0x00,0x00] +flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 slc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: global_atomic_add_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x3c,0xdd,0x00,0x02,0x7f,0x00] +global_atomic_add_f64 v[0:1], v[2:3], off + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: global_atomic_min_f64 v[0:1], v[2:3], off ; encoding: 
[0x00,0x80,0x40,0xdd,0x00,0x02,0x7f,0x00] +global_atomic_min_f64 v[0:1], v[2:3], off + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: global_atomic_max_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x44,0xdd,0x00,0x02,0x7f,0x00] +global_atomic_max_f64 v[0:1], v[2:3], off + +// NOT-GFX90A: error: failed parsing operand +// GFX90A: image_load v[0:4], v2, s[0:7] dmask:0xf unorm scc ; encoding: [0x80,0x1f,0x00,0xf0,0x02,0x00,0x00,0x00] +image_load v[0:4], v2, s[0:7] dmask:0xf unorm scc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x08] +v_fmac_f64_e32 v[4:5], v[2:3], v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x09] +v_fmac_f64_e32 v[254:255], v[2:3], v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x08] +v_fmac_f64_e32 v[4:5], v[254:255], v[4:5] + +// GFX90A: v_fmac_f64_e32 v[4:5], flat_scratch, v[4:5] ; encoding: [0x66,0x08,0x08,0x08] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: instruction not supported on this GPU +v_fmac_f64_e32 v[4:5], flat_scratch, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x08] +v_fmac_f64_e32 v[4:5], vcc, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x08] +v_fmac_f64_e32 v[4:5], exec, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x08] +v_fmac_f64_e32 v[4:5], 0, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], -1, v[4:5] ; 
encoding: [0xc1,0x08,0x08,0x08] +v_fmac_f64_e32 v[4:5], -1, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x08] +v_fmac_f64_e32 v[4:5], 0.5, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], -4.0, v[4:5] ; encoding: [0xf7,0x08,0x08,0x08] +v_fmac_f64_e32 v[4:5], -4.0, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xff,0x08,0x08,0x08,0x56,0x34,0x12,0xaf] +v_fmac_f64_e32 v[4:5], 0xaf123456, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x08,0x73,0x72,0x71,0x3f] +v_fmac_f64_e32 v[4:5], 0x3f717273, v[4:5] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x08] +v_fmac_f64_e32 v[4:5], v[2:3], v[254:255] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[254:255], v[2:3], v[8:9] ; encoding: [0xfe,0x00,0x04,0xd1,0x02,0x11,0x02,0x00] +v_fmac_f64_e64 v[254:255], v[2:3], v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[254:255], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xfe,0x11,0x02,0x00] +v_fmac_f64_e64 v[4:5], v[254:255], v[8:9] + +// GFX90A: v_fmac_f64_e64 v[4:5], flat_scratch, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x66,0x10,0x02,0x00] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: instruction not supported on this GPU +v_fmac_f64_e64 v[4:5], flat_scratch, v[8:9] + +// NOT-GFX90A: error: 
instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], vcc, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x6a,0x10,0x02,0x00] +v_fmac_f64_e64 v[4:5], vcc, v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], exec, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x7e,0x10,0x02,0x00] +v_fmac_f64_e64 v[4:5], exec, v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], 0, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x80,0x10,0x02,0x00] +v_fmac_f64_e64 v[4:5], 0, v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], -1, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xc1,0x10,0x02,0x00] +v_fmac_f64_e64 v[4:5], -1, v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], 0.5, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xf0,0x10,0x02,0x00] +v_fmac_f64_e64 v[4:5], 0.5, v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], -4.0, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xf7,0x10,0x02,0x00] +v_fmac_f64_e64 v[4:5], -4.0, v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[254:255] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xfd,0x03,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], v[254:255] + +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], flat_scratch ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xcd,0x00,0x00] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: instruction not supported on this GPU +v_fmac_f64_e64 v[4:5], v[2:3], flat_scratch + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xd5,0x00,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], vcc + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], exec ; encoding: 
[0x04,0x00,0x04,0xd1,0x02,0xfd,0x00,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], exec + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x01,0x01,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], 0 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x83,0x01,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], -1 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xe1,0x01,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], 0.5 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xef,0x01,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], -4.0 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], -v[2:3], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x20] +v_fmac_f64_e64 v[4:5], -v[2:3], v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x40] +v_fmac_f64_e64 v[4:5], v[2:3], -v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], -v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x60] +v_fmac_f64_e64 v[4:5], -v[2:3], -v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], |v[2:3]|, v[8:9] ; encoding: [0x04,0x01,0x04,0xd1,0x02,0x11,0x02,0x00] +v_fmac_f64_e64 v[4:5], |v[2:3]|, v[8:9] + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], |v[8:9]| ; encoding: [0x04,0x02,0x04,0xd1,0x02,0x11,0x02,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], |v[8:9]| + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: 
v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x04,0xd1,0x02,0x11,0x02,0x00] +v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x04,0xd1,0x02,0x11,0x02,0x00] +v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] clamp + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x08] +v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:2 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:4 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x10] +v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:4 + +// NOT-GFX90A: error: instruction not supported on this GPU +// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x18] +v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00] +v_mul_legacy_f32_e64 v5, v1, v2 + +// GFX90A: v_mul_legacy_f32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00] +v_mul_legacy_f32_e64 v255, v1, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, v255, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00] +v_mul_legacy_f32_e64 v5, v255, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, s1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, s1, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, s101, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, s101, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, vcc_lo, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, vcc_hi, v2 + +// GFX90A: 
v_mul_legacy_f32_e64 v5, m0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, m0, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, exec_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, exec_lo, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, exec_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, exec_hi, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, 0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, 0, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, -1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, -1, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, 0.5, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, 0.5, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, -4.0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00] +v_mul_legacy_f32_e64 v5, -4.0, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, v255 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00] +v_mul_legacy_f32_e64 v5, v1, v255 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, s2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00] +v_mul_legacy_f32_e64 v5, v1, s2 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, s101 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00] +v_mul_legacy_f32_e64 v5, v1, s101 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00] +v_mul_legacy_f32_e64 v5, v1, vcc_lo + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00] +v_mul_legacy_f32_e64 v5, v1, vcc_hi + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, m0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00] +v_mul_legacy_f32_e64 v5, v1, m0 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00] +v_mul_legacy_f32_e64 v5, v1, exec_lo + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_hi ; encoding: 
[0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00] +v_mul_legacy_f32_e64 v5, v1, exec_hi + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, 0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00] +v_mul_legacy_f32_e64 v5, v1, 0 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, -1 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00] +v_mul_legacy_f32_e64 v5, v1, -1 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, 0.5 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00] +v_mul_legacy_f32_e64 v5, v1, 0.5 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, -4.0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00] +v_mul_legacy_f32_e64 v5, v1, -4.0 + +// GFX90A: v_mul_legacy_f32_e64 v5, -v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20] +v_mul_legacy_f32_e64 v5, -v1, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40] +v_mul_legacy_f32_e64 v5, v1, -v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, -v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60] +v_mul_legacy_f32_e64 v5, -v1, -v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, |v1|, v2 ; encoding: [0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00] +v_mul_legacy_f32_e64 v5, |v1|, v2 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, |v2| ; encoding: [0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00] +v_mul_legacy_f32_e64 v5, v1, |v2| + +// GFX90A: v_mul_legacy_f32_e64 v5, |v1|, |v2| ; encoding: [0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00] +v_mul_legacy_f32_e64 v5, |v1|, |v2| + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 clamp ; encoding: [0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00] +v_mul_legacy_f32_e64 v5, v1, v2 clamp + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08] +v_mul_legacy_f32_e64 v5, v1, v2 mul:2 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:4 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10] +v_mul_legacy_f32_e64 v5, v1, v2 mul:4 + +// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 div:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18] 
+v_mul_legacy_f32_e64 v5, v1, v2 div:2 + +// GFX90A: v_xor_b32_dpp v6, v29, v27 row_newbcast:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x50,0x01,0xff] +// NOT-GFX90A: error: not a valid operand. +v_xor_b32 v6, v29, v27 row_newbcast:0 + +// GFX90A: v_xor_b32_dpp v6, v29, v27 row_newbcast:7 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x57,0x01,0xff] +// NOT-GFX90A: error: not a valid operand. +v_xor_b32 v6, v29, v27 row_newbcast:7 + +// GFX90A: v_xor_b32_dpp v6, v29, v27 row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x5f,0x01,0xff] +// NOT-GFX90A: error: not a valid operand. +v_xor_b32 v6, v29, v27 row_newbcast:15 + +// GFX90A: buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: operands are not valid for this GPU or mode +buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc + +// GFX90A: buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: operands are not valid for this GPU or mode +buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc + +// GFX90A: buffer_atomic_pk_add_f16 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x38,0xe1,0x02,0x00,0x01,0x80] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: operands are not valid for this GPU or mode +buffer_atomic_pk_add_f16 v0, v2, s[4:7], 0 idxen glc + +// GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc ; encoding: [0x00,0x80,0x35,0xdd,0x00,0x02,0x7f,0x00] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: operands are not valid for this GPU or mode +global_atomic_add_f32 v0, v[0:1], v2, off glc + +// GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc ; encoding: 
[0x00,0x80,0x39,0xdd,0x00,0x02,0x7f,0x00] +// NOT-GFX1010: error: instruction not supported on this GPU +// NOT-GFX908: error: operands are not valid for this GPU or mode +global_atomic_pk_add_f16 v0, v[0:1], v2, off glc + +// GFX90A: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x3d,0xdd,0x00,0x02,0x7f,0x00] +// NOT-GFX90A: error: instruction not supported on this GPU +global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc + +// GFX90A: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x45,0xdd,0x00,0x02,0x7f,0x00] +// NOT-GFX90A: error: instruction not supported on this GPU +global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc + +// GFX90A: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x41,0xdd,0x00,0x02,0x7f,0x00] +// NOT-GFX90A: error: instruction not supported on this GPU +global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc + +// GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x3d,0xdd,0x00,0x02,0x00,0x00] +// NOT-GFX90A: error: instruction not supported on this GPU +flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc + +// GFX90A: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x45,0xdd,0x00,0x02,0x00,0x00] +// NOT-GFX90A: error: instruction not supported on this GPU +flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc + +// GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00] +// NOT-GFX90A: error: instruction not supported on this GPU +flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc Index: llvm/test/MC/AMDGPU/gfx90a_err.s =================================================================== --- /dev/null +++ llvm/test/MC/AMDGPU/gfx90a_err.s @@ -0,0 +1,196 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=GFX90A --implicit-check-not=error: %s + +ds_add_src2_u32 v1 +// GFX90A: error: instruction not supported on this GPU + 
+ds_add_src2_f32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_sub_src2_u32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_rsub_src2_u32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_inc_src2_u32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_dec_src2_u32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_min_src2_i32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_max_src2_i32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_min_src2_u32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_max_src2_u32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_and_src2_b32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_or_src2_b32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_xor_src2_b32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_min_src2_f32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_max_src2_f32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_add_src2_u64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_sub_src2_u64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_rsub_src2_u64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_inc_src2_u64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_dec_src2_u64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_min_src2_i64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_max_src2_i64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_min_src2_u64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_max_src2_u64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_and_src2_b64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_or_src2_b64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_xor_src2_b64 v1 +// 
GFX90A: error: instruction not supported on this GPU + +ds_min_src2_f64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_max_src2_f64 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_write_src2_b32 v1 +// GFX90A: error: instruction not supported on this GPU + +ds_write_src2_b64 v1 +// GFX90A: error: instruction not supported on this GPU + +image_gather4 v[5:8], v1, s[8:15], s[12:15] +// GFX90A: error: instruction not supported on this GPU + +image_get_lod v5, v1, s[8:15], s[12:15] +// GFX90A: error: instruction not supported on this GPU + +v_mul_legacy_f32_e32 v5, v1, v2 +// GFX90A: error: e32 variant of this instruction is not supported + +v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +// GFX90A: error: sdwa variant of this instruction is not supported + +v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// GFX90A: error: dpp variant of this instruction is not supported + +v_interp_p1_f32 v5, v1, attr0.x +// GFX90A: error: instruction not supported on this GPU + +v_interp_p1_f32_e64 v5, v2, attr0.x +// GFX90A: error: instruction not supported on this GPU + +v_interp_p2_f32 v5, v1, attr0.x +// GFX90A: error: instruction not supported on this GPU + +v_interp_mov_f32 v5, p10, attr0.x +// GFX90A: error: instruction not supported on this GPU + +v_interp_p1ll_f16 v5, v2, attr0.x +// GFX90A: error: instruction not supported on this GPU + +v_interp_p1lv_f16 v5, v2, attr0.x, v3 +// GFX90A: error: instruction not supported on this GPU + +v_interp_p2_legacy_f16 v5, v2, attr0.x, v3 +// GFX90A: error: instruction not supported on this GPU + +v_interp_p2_f16 v5, v2, attr0.x, v3 +// GFX90A: error: instruction not supported on this GPU + +v_mov_b32_dpp v5, v1 row_share:1 row_mask:0x0 bank_mask:0x0 +// GFX90A: error: not a valid operand + +v_ceil_f64_dpp v[0:1], v[2:3] quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf +// GFX90A: error: not a valid operand. 
+ +v_ceil_f64_dpp v[0:1], v[2:3] row_shl:1 row_mask:0xf bank_mask:0xf +// GFX90A: error: not a valid operand. + +v_ceil_f64_dpp v[0:1], v[2:3] wave_ror:1 row_mask:0xf bank_mask:0xf +// GFX90A: error: not a valid operand. + +v_ceil_f64_dpp v[0:1], v[2:3] row_share:1 row_mask:0xf bank_mask:0xf +// GFX90A: error: not a valid operand. + +flat_atomic_add v2, v[2:3], a2 glc +// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR + +flat_atomic_add a2, v[2:3], v2 glc +// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR + +tbuffer_store_format_xyzw v[0:3], off, s[4:7], dfmt:15, nfmt:2, s1 tfe +// GFX90A: error: operands are not valid for this GPU or mode + +buffer_store_dwordx4 v[0:3], off, s[12:15], s4 offset:4095 glc tfe +// GFX90A: error: operands are not valid for this GPU or mode + +ds_write2_b64 v1, a[4:5], v[2:3] offset1:255 +// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR + +ds_write2_b64 v1, v[4:5], a[2:3] offset1:255 +// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR + +ds_write2_b64 v1, a[4:5], v[2:3] offset1:255 gds +// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR + +ds_write2_b64 v1, v[4:5], a[2:3] offset1:255 gds +// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR + +ds_wrxchg2st64_rtn_b32 v[6:7], v1, a2, a3 offset0:127 +// GFX90A: error: invalid register class: data and dst should be all VGPR or AGPR + +image_load v[0:4], v2, s[0:7] dmask:0xf unorm tfe +// GFX90A: error: operands are not valid for this GPU or mode + +image_sample_lz v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf +// GFX90A: error: instruction not supported on this GPU + +image_sample_d v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf +// GFX90A: error: instruction not supported on this GPU + +image_sample_o v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf +// GFX90A: error: instruction not supported on this GPU + 
+image_sample_cl v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf +// GFX90A: error: instruction not supported on this GPU + +image_sample_cd v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf +// GFX90A: error: instruction not supported on this GPU + +image_sample_b v[0:3], v[0:1], s[4:11], s[16:19] dmask:0xf +// GFX90A: error: instruction not supported on this GPU Index: llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s =================================================================== --- /dev/null +++ llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s @@ -0,0 +1,11194 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx908 %s 2>&1 | FileCheck --check-prefix=NOT-GFX90A --implicit-check-not=error: %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=GFX90A %s + +// GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte a5, v[2:3] offset:4095 + +// GFX90A: flat_load_ubyte a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte a255, v[2:3] offset:4095 + +// GFX90A: flat_load_ubyte a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte a5, v[254:255] offset:4095 + +// GFX90A: flat_load_ubyte a5, v[2:3] ; encoding: [0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte a5, v[2:3] + +// GFX90A: flat_load_ubyte a5, v[2:3] ; encoding: [0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte a5, v[2:3] + +// GFX90A: flat_load_ubyte a5, v[2:3] offset:7 ; 
encoding: [0x07,0x00,0x40,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte a5, v[2:3] offset:7 + +// GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x41,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte a5, v[2:3] offset:4095 + +// GFX90A: flat_load_sbyte a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte a255, v[2:3] offset:4095 + +// GFX90A: flat_load_sbyte a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte a5, v[254:255] offset:4095 + +// GFX90A: flat_load_sbyte a5, v[2:3] ; encoding: [0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte a5, v[2:3] + +// GFX90A: flat_load_sbyte a5, v[2:3] ; encoding: [0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte a5, v[2:3] + +// GFX90A: flat_load_sbyte a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x44,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +flat_load_sbyte a5, v[2:3] offset:7 + +// GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x45,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_ushort a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ushort a5, v[2:3] offset:4095 + +// GFX90A: flat_load_ushort a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ushort a255, v[2:3] offset:4095 + +// GFX90A: flat_load_ushort a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ushort a5, v[254:255] offset:4095 + +// GFX90A: flat_load_ushort a5, v[2:3] ; encoding: [0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ushort a5, v[2:3] + +// GFX90A: flat_load_ushort a5, v[2:3] ; encoding: [0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ushort a5, v[2:3] + +// GFX90A: flat_load_ushort a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x48,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+flat_load_ushort a5, v[2:3] offset:7 + +// GFX90A: flat_load_ushort a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x49,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ushort a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_ushort a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ushort a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_sshort a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sshort a5, v[2:3] offset:4095 + +// GFX90A: flat_load_sshort a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sshort a255, v[2:3] offset:4095 + +// GFX90A: flat_load_sshort a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sshort a5, v[254:255] offset:4095 + +// GFX90A: flat_load_sshort a5, v[2:3] ; encoding: [0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sshort a5, v[2:3] + +// GFX90A: flat_load_sshort a5, v[2:3] ; encoding: [0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sshort a5, v[2:3] + +// GFX90A: flat_load_sshort a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sshort a5, v[2:3] offset:7 + +// GFX90A: flat_load_sshort a5, 
v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x4d,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sshort a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_sshort a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sshort a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_dword a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dword a5, v[2:3] offset:4095 + +// GFX90A: flat_load_dword a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dword a255, v[2:3] offset:4095 + +// GFX90A: flat_load_dword a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dword a5, v[254:255] offset:4095 + +// GFX90A: flat_load_dword a5, v[2:3] ; encoding: [0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dword a5, v[2:3] + +// GFX90A: flat_load_dword a5, v[2:3] ; encoding: [0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dword a5, v[2:3] + +// GFX90A: flat_load_dword a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x50,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dword a5, v[2:3] offset:7 + +// GFX90A: flat_load_dword a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x51,0xdc,0x02,0x00,0x80,0x05] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dword a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_dword a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dword a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx2 a[6:7], v[2:3] offset:4095 + +// GFX90A: flat_load_dwordx2 a[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx2 a[254:255], v[2:3] offset:4095 + +// GFX90A: flat_load_dwordx2 a[6:7], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0xfe,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx2 a[6:7], v[254:255] offset:4095 + +// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] ; encoding: [0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx2 a[6:7], v[2:3] + +// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] ; encoding: [0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx2 a[6:7], v[2:3] + +// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:7 ; encoding: [0x07,0x00,0x54,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx2 a[6:7], v[2:3] offset:7 + +// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x55,0xdc,0x02,0x00,0x80,0x06] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx2 a[6:7], v[2:3] offset:4095 glc + +// GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx2 a[6:7], v[2:3] offset:4095 slc + +// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx3 a[6:8], v[2:3] offset:4095 + +// GFX90A: flat_load_dwordx3 a[252:254], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx3 a[252:254], v[2:3] offset:4095 + +// GFX90A: flat_load_dwordx3 a[6:8], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0xfe,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx3 a[6:8], v[254:255] offset:4095 + +// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] ; encoding: [0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx3 a[6:8], v[2:3] + +// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] ; encoding: [0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx3 a[6:8], v[2:3] + +// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:7 ; encoding: [0x07,0x00,0x58,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx3 a[6:8], v[2:3] offset:7 + +// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 glc ; encoding: 
[0xff,0x0f,0x59,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx3 a[6:8], v[2:3] offset:4095 glc + +// GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx3 a[6:8], v[2:3] offset:4095 slc + +// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx4 a[6:9], v[2:3] offset:4095 + +// GFX90A: flat_load_dwordx4 a[252:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx4 a[252:255], v[2:3] offset:4095 + +// GFX90A: flat_load_dwordx4 a[6:9], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0xfe,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx4 a[6:9], v[254:255] offset:4095 + +// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] ; encoding: [0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx4 a[6:9], v[2:3] + +// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] ; encoding: [0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx4 a[6:9], v[2:3] + +// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:7 ; encoding: [0x07,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx4 a[6:9], v[2:3] offset:7 + +// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 glc ; 
encoding: [0xff,0x0f,0x5d,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx4 a[6:9], v[2:3] offset:4095 glc + +// GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xdc,0x02,0x00,0x80,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_dwordx4 a[6:9], v[2:3] offset:4095 slc + +// GFX90A: flat_store_byte v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte v[2:3], a2 offset:4095 + +// GFX90A: flat_store_byte v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0xfe,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte v[254:255], a2 offset:4095 + +// GFX90A: flat_store_byte v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0x02,0xff,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte v[2:3], a255 offset:4095 + +// GFX90A: flat_store_byte v[2:3], a2 ; encoding: [0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte v[2:3], a2 + +// GFX90A: flat_store_byte v[2:3], a2 ; encoding: [0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte v[2:3], a2 + +// GFX90A: flat_store_byte v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x60,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte v[2:3], a2 offset:7 + +// GFX90A: flat_store_byte v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x61,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte v[2:3], a2 offset:4095 glc + +// GFX90A: flat_store_byte v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte v[2:3], a2 offset:4095 slc + +// GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte_d16_hi v[2:3], a2 offset:4095 + +// GFX90A: flat_store_byte_d16_hi v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0xfe,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte_d16_hi v[254:255], a2 offset:4095 + +// GFX90A: flat_store_byte_d16_hi v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x02,0xff,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte_d16_hi v[2:3], a255 offset:4095 + +// GFX90A: flat_store_byte_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte_d16_hi v[2:3], a2 + +// GFX90A: flat_store_byte_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte_d16_hi v[2:3], a2 + +// GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x64,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte_d16_hi v[2:3], a2 offset:7 + +// GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x65,0xdc,0x02,0x02,0x80,0x00] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte_d16_hi v[2:3], a2 offset:4095 glc + +// GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_byte_d16_hi v[2:3], a2 offset:4095 slc + +// GFX90A: flat_store_short v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short v[2:3], a2 offset:4095 + +// GFX90A: flat_store_short v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0xfe,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short v[254:255], a2 offset:4095 + +// GFX90A: flat_store_short v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x02,0xff,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short v[2:3], a255 offset:4095 + +// GFX90A: flat_store_short v[2:3], a2 ; encoding: [0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short v[2:3], a2 + +// GFX90A: flat_store_short v[2:3], a2 ; encoding: [0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short v[2:3], a2 + +// GFX90A: flat_store_short v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x68,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short v[2:3], a2 offset:7 + +// GFX90A: flat_store_short v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x69,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads 
and stores not supported on this GPU +flat_store_short v[2:3], a2 offset:4095 glc + +// GFX90A: flat_store_short v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short v[2:3], a2 offset:4095 slc + +// GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short_d16_hi v[2:3], a2 offset:4095 + +// GFX90A: flat_store_short_d16_hi v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0xfe,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short_d16_hi v[254:255], a2 offset:4095 + +// GFX90A: flat_store_short_d16_hi v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x02,0xff,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short_d16_hi v[2:3], a255 offset:4095 + +// GFX90A: flat_store_short_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short_d16_hi v[2:3], a2 + +// GFX90A: flat_store_short_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short_d16_hi v[2:3], a2 + +// GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short_d16_hi v[2:3], a2 offset:7 + +// GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x6d,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +flat_store_short_d16_hi v[2:3], a2 offset:4095 glc + +// GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_short_d16_hi v[2:3], a2 offset:4095 slc + +// GFX90A: flat_store_dword v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dword v[2:3], a2 offset:4095 + +// GFX90A: flat_store_dword v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0xfe,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dword v[254:255], a2 offset:4095 + +// GFX90A: flat_store_dword v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x02,0xff,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dword v[2:3], a255 offset:4095 + +// GFX90A: flat_store_dword v[2:3], a2 ; encoding: [0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dword v[2:3], a2 + +// GFX90A: flat_store_dword v[2:3], a2 ; encoding: [0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dword v[2:3], a2 + +// GFX90A: flat_store_dword v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x70,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dword v[2:3], a2 offset:7 + +// GFX90A: flat_store_dword v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x71,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this 
GPU +flat_store_dword v[2:3], a2 offset:4095 glc + +// GFX90A: flat_store_dword v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dword v[2:3], a2 offset:4095 slc + +// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_store_dwordx2 v[254:255], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0xfe,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx2 v[254:255], a[2:3] offset:4095 + +// GFX90A: flat_store_dwordx2 v[2:3], a[254:255] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x02,0xfe,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx2 v[2:3], a[254:255] offset:4095 + +// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] ; encoding: [0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx2 v[2:3], a[2:3] + +// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] ; encoding: [0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx2 v[2:3], a[2:3] + +// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:7 ; encoding: [0x07,0x00,0x74,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx2 v[2:3], a[2:3] offset:7 + +// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x75,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported 
on this GPU +flat_store_dwordx2 v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx2 v[2:3], a[2:3] offset:4095 slc + +// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx3 v[2:3], a[2:4] offset:4095 + +// GFX90A: flat_store_dwordx3 v[254:255], a[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0xfe,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx3 v[254:255], a[2:4] offset:4095 + +// GFX90A: flat_store_dwordx3 v[2:3], a[252:254] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x02,0xfc,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx3 v[2:3], a[252:254] offset:4095 + +// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] ; encoding: [0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx3 v[2:3], a[2:4] + +// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] ; encoding: [0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx3 v[2:3], a[2:4] + +// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:7 ; encoding: [0x07,0x00,0x78,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx3 v[2:3], a[2:4] offset:7 + +// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 glc ; encoding: [0xff,0x0f,0x79,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr 
loads and stores not supported on this GPU +flat_store_dwordx3 v[2:3], a[2:4] offset:4095 glc + +// GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx3 v[2:3], a[2:4] offset:4095 slc + +// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx4 v[2:3], a[2:5] offset:4095 + +// GFX90A: flat_store_dwordx4 v[254:255], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0xfe,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx4 v[254:255], a[2:5] offset:4095 + +// GFX90A: flat_store_dwordx4 v[2:3], a[252:255] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x02,0xfc,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx4 v[2:3], a[252:255] offset:4095 + +// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] ; encoding: [0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx4 v[2:3], a[2:5] + +// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] ; encoding: [0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx4 v[2:3], a[2:5] + +// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:7 ; encoding: [0x07,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx4 v[2:3], a[2:5] offset:7 + +// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x7d,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx4 v[2:3], a[2:5] offset:4095 glc + +// GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xdc,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_store_dwordx4 v[2:3], a[2:5] offset:4095 slc + +// GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16 a5, v[2:3] offset:4095 + +// GFX90A: flat_load_ubyte_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16 a255, v[2:3] offset:4095 + +// GFX90A: flat_load_ubyte_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16 a5, v[254:255] offset:4095 + +// GFX90A: flat_load_ubyte_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16 a5, v[2:3] + +// GFX90A: flat_load_ubyte_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16 a5, v[2:3] + +// GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x80,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16 a5, v[2:3] offset:7 + +// GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16 a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16 a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 + +// GFX90A: flat_load_ubyte_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16_hi a255, v[2:3] offset:4095 + +// GFX90A: flat_load_ubyte_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16_hi a5, v[254:255] offset:4095 + +// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16_hi a5, v[2:3] + +// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16_hi a5, v[2:3] + +// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x84,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16_hi a5, v[2:3] offset:7 + +// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16 a5, v[2:3] offset:4095 + +// GFX90A: flat_load_sbyte_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16 a255, v[2:3] offset:4095 + +// GFX90A: flat_load_sbyte_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16 a5, v[254:255] offset:4095 + +// GFX90A: flat_load_sbyte_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16 a5, v[2:3] + +// GFX90A: flat_load_sbyte_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16 a5, v[2:3] + +// GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x88,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16 a5, v[2:3] offset:7 + +// GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16 a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16 a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 + +// GFX90A: flat_load_sbyte_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16_hi a255, v[2:3] offset:4095 + +// GFX90A: flat_load_sbyte_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16_hi a5, v[254:255] offset:4095 + +// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16_hi a5, v[2:3] + +// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16_hi a5, v[2:3] + +// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16_hi a5, v[2:3] offset:7 + +// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16 a5, v[2:3] offset:4095 + +// GFX90A: flat_load_short_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16 a255, v[2:3] offset:4095 + +// GFX90A: flat_load_short_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16 a5, v[254:255] offset:4095 + +// GFX90A: flat_load_short_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16 a5, v[2:3] + +// GFX90A: flat_load_short_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16 a5, v[2:3] + +// GFX90A: flat_load_short_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x90,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16 a5, v[2:3] offset:7 + +// GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +flat_load_short_d16 a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16 a5, v[2:3] offset:4095 slc + +// GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16_hi a5, v[2:3] offset:4095 + +// GFX90A: flat_load_short_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16_hi a255, v[2:3] offset:4095 + +// GFX90A: flat_load_short_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0xfe,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16_hi a5, v[254:255] offset:4095 + +// GFX90A: flat_load_short_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16_hi a5, v[2:3] + +// GFX90A: flat_load_short_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16_hi a5, v[2:3] + +// GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x94,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16_hi a5, v[2:3] offset:7 + +// GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16_hi a5, v[2:3] offset:4095 glc + +// GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xdc,0x02,0x00,0x80,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_load_short_d16_hi a5, v[2:3] offset:4095 slc + +// GFX90A: flat_atomic_swap a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_swap a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_add a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_add a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_sub a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_sub a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_smin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_smin a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_umin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_umin a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_smax a0, v[2:3], 
a2 offset:4095 glc ; encoding: [0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_smax a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_umax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_umax a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_and a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_and a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_or a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_or a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_xor a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_xor a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_inc a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_inc a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_dec a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_dec a0, v[2:3], a2 offset:4095 glc + +// GFX90A: flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc + +// GFX90A: flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_umax_x2 
a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc + +// GFX90A: flat_atomic_swap v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_swap v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_cmpswap v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x04,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_cmpswap v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_add v[2:3], a2 offset:4095 ; 
encoding: [0xff,0x0f,0x08,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_add v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_sub v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_sub v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_smin v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x10,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_smin v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_umin v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x14,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_umin v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_smax v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x18,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_smax v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_umax v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_umax v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_and v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x20,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_and v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_or v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x24,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_or v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_xor v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x28,0xdd,0x02,0x02,0x80,0x00] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_xor v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_inc v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_inc v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_dec v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x30,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_dec v[2:3], a2 offset:4095 + +// GFX90A: flat_atomic_swap_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_swap_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_cmpswap_x2 v[2:3], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_cmpswap_x2 v[2:3], a[2:5] offset:4095 + +// GFX90A: flat_atomic_add_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_add_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_sub_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_sub_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_smin_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_smin_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_umin_x2 v[2:3], a[2:3] offset:4095 ; encoding: 
[0xff,0x0f,0x94,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_umin_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_smax_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x98,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_smax_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_umax_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x9c,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_umax_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_and_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa0,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_and_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_or_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa4,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_or_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_xor_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa8,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_xor_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_inc_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xac,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_inc_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: flat_atomic_dec_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xb0,0xdd,0x02,0x02,0x80,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +flat_atomic_dec_x2 v[2:3], a[2:3] offset:4095 + +// GFX90A: 
global_load_ubyte a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ubyte a5, v[2:3], off offset:-1 + +// GFX90A: global_load_ubyte a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ubyte a255, v[2:3], off offset:-1 + +// GFX90A: global_load_ubyte a5, v[2:3], off ; encoding: [0x00,0x80,0x40,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ubyte a5, v[2:3], off + +// GFX90A: global_load_sbyte a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sbyte a5, v[2:3], off offset:-1 + +// GFX90A: global_load_sbyte a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sbyte a255, v[2:3], off offset:-1 + +// GFX90A: global_load_sbyte a5, v[2:3], off ; encoding: [0x00,0x80,0x44,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sbyte a5, v[2:3], off + +// GFX90A: global_load_ushort a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ushort a5, v[2:3], off offset:-1 + +// GFX90A: global_load_ushort a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ushort a255, v[2:3], off offset:-1 + +// GFX90A: 
global_load_ushort a5, v[2:3], off ; encoding: [0x00,0x80,0x48,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ushort a5, v[2:3], off + +// GFX90A: global_load_sshort a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sshort a5, v[2:3], off offset:-1 + +// GFX90A: global_load_sshort a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sshort a255, v[2:3], off offset:-1 + +// GFX90A: global_load_sshort a5, v[2:3], off ; encoding: [0x00,0x80,0x4c,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sshort a5, v[2:3], off + +// GFX90A: global_load_dword a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dword a5, v[2:3], off offset:-1 + +// GFX90A: global_load_dword a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dword a255, v[2:3], off offset:-1 + +// GFX90A: global_load_dword a5, v[2:3], off ; encoding: [0x00,0x80,0x50,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dword a5, v[2:3], off + +// GFX90A: global_load_dwordx2 a[6:7], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dwordx2 a[6:7], v[2:3], off offset:-1 + +// GFX90A: 
global_load_dwordx2 a[254:255], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dwordx2 a[254:255], v[2:3], off offset:-1 + +// GFX90A: global_load_dwordx2 a[6:7], v[2:3], off ; encoding: [0x00,0x80,0x54,0xdc,0x02,0x00,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dwordx2 a[6:7], v[2:3], off + +// GFX90A: global_load_dwordx3 a[6:8], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dwordx3 a[6:8], v[2:3], off offset:-1 + +// GFX90A: global_load_dwordx3 a[252:254], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dwordx3 a[252:254], v[2:3], off offset:-1 + +// GFX90A: global_load_dwordx3 a[6:8], v[2:3], off ; encoding: [0x00,0x80,0x58,0xdc,0x02,0x00,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dwordx3 a[6:8], v[2:3], off + +// GFX90A: global_load_dwordx4 a[6:9], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dwordx4 a[6:9], v[2:3], off offset:-1 + +// GFX90A: global_load_dwordx4 a[252:255], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_dwordx4 a[252:255], v[2:3], off offset:-1 + +// GFX90A: global_load_dwordx4 a[6:9], v[2:3], off ; encoding: [0x00,0x80,0x5c,0xdc,0x02,0x00,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not 
supported on this GPU +global_load_dwordx4 a[6:9], v[2:3], off + +// GFX90A: global_store_byte v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_byte v[2:3], a2, off offset:-1 + +// GFX90A: global_store_byte v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x02,0xff,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_byte v[2:3], a255, off offset:-1 + +// GFX90A: global_store_byte v[2:3], a2, off ; encoding: [0x00,0x80,0x60,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_byte v[2:3], a2, off + +// GFX90A: global_store_byte_d16_hi v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_byte_d16_hi v[2:3], a2, off offset:-1 + +// GFX90A: global_store_byte_d16_hi v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x02,0xff,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_byte_d16_hi v[2:3], a255, off offset:-1 + +// GFX90A: global_store_byte_d16_hi v[2:3], a2, off ; encoding: [0x00,0x80,0x64,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_byte_d16_hi v[2:3], a2, off + +// GFX90A: global_store_short v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_short v[2:3], a2, off offset:-1 + +// GFX90A: global_store_short v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x02,0xff,0xff,0x00] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +global_store_short v[2:3], a255, off offset:-1 + +// GFX90A: global_store_short v[2:3], a2, off ; encoding: [0x00,0x80,0x68,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_short v[2:3], a2, off + +// GFX90A: global_store_short_d16_hi v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_short_d16_hi v[2:3], a2, off offset:-1 + +// GFX90A: global_store_short_d16_hi v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x02,0xff,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_short_d16_hi v[2:3], a255, off offset:-1 + +// GFX90A: global_store_short_d16_hi v[2:3], a2, off ; encoding: [0x00,0x80,0x6c,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_short_d16_hi v[2:3], a2, off + +// GFX90A: global_store_dword v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dword v[2:3], a2, off offset:-1 + +// GFX90A: global_store_dword v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x02,0xff,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dword v[2:3], a255, off offset:-1 + +// GFX90A: global_store_dword v[2:3], a2, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dword v[2:3], a2, off + +// GFX90A: global_store_dwordx2 v[2:3], a[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x02,0x02,0xff,0x00] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dwordx2 v[2:3], a[2:3], off offset:-1 + +// GFX90A: global_store_dwordx2 v[2:3], a[254:255], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x02,0xfe,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dwordx2 v[2:3], a[254:255], off offset:-1 + +// GFX90A: global_store_dwordx2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dwordx2 v[2:3], a[2:3], off + +// GFX90A: global_store_dwordx3 v[2:3], a[2:4], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dwordx3 v[2:3], a[2:4], off offset:-1 + +// GFX90A: global_store_dwordx3 v[2:3], a[252:254], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x02,0xfc,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dwordx3 v[2:3], a[252:254], off offset:-1 + +// GFX90A: global_store_dwordx3 v[2:3], a[2:4], off ; encoding: [0x00,0x80,0x78,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dwordx3 v[2:3], a[2:4], off + +// GFX90A: global_store_dwordx4 v[2:3], a[2:5], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dwordx4 v[2:3], a[2:5], off offset:-1 + +// GFX90A: global_store_dwordx4 v[2:3], a[252:255], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x02,0xfc,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dwordx4 v[2:3], a[252:255], off offset:-1 + +// 
GFX90A: global_store_dwordx4 v[2:3], a[2:5], off ; encoding: [0x00,0x80,0x7c,0xdc,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_store_dwordx4 v[2:3], a[2:5], off + +// GFX90A: global_load_ubyte_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ubyte_d16 a5, v[2:3], off offset:-1 + +// GFX90A: global_load_ubyte_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ubyte_d16 a255, v[2:3], off offset:-1 + +// GFX90A: global_load_ubyte_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x80,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ubyte_d16 a5, v[2:3], off + +// GFX90A: global_load_ubyte_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ubyte_d16_hi a5, v[2:3], off offset:-1 + +// GFX90A: global_load_ubyte_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ubyte_d16_hi a255, v[2:3], off offset:-1 + +// GFX90A: global_load_ubyte_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x84,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_ubyte_d16_hi a5, v[2:3], off + +// GFX90A: global_load_sbyte_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+global_load_sbyte_d16 a5, v[2:3], off offset:-1 + +// GFX90A: global_load_sbyte_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sbyte_d16 a255, v[2:3], off offset:-1 + +// GFX90A: global_load_sbyte_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x88,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sbyte_d16 a5, v[2:3], off + +// GFX90A: global_load_sbyte_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sbyte_d16_hi a5, v[2:3], off offset:-1 + +// GFX90A: global_load_sbyte_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sbyte_d16_hi a255, v[2:3], off offset:-1 + +// GFX90A: global_load_sbyte_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x8c,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_sbyte_d16_hi a5, v[2:3], off + +// GFX90A: global_load_short_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_short_d16 a5, v[2:3], off offset:-1 + +// GFX90A: global_load_short_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_short_d16 a255, v[2:3], off offset:-1 + +// GFX90A: global_load_short_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x90,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +global_load_short_d16 a5, v[2:3], off + +// GFX90A: global_load_short_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_short_d16_hi a5, v[2:3], off offset:-1 + +// GFX90A: global_load_short_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_short_d16_hi a255, v[2:3], off offset:-1 + +// GFX90A: global_load_short_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x94,0xdc,0x02,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_load_short_d16_hi a5, v[2:3], off + +// GFX90A: global_atomic_swap a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x01,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_swap a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_cmpswap a1, v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x05,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_cmpswap a1, v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_add a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x09,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_add a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_sub a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x0d,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_sub a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_smin a1, v[2:3], a2, off glc ; encoding: 
[0x00,0x80,0x11,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_smin a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_umin a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x15,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_umin a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_smax a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x19,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_smax a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_umax a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x1d,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_umax a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_and a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x21,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_and a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_or a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x25,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_or a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_xor a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x29,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_xor a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_inc a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x2d,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_inc a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_dec a1, v[2:3], a2, off glc ; encoding: 
[0x00,0x80,0x31,0xdd,0x02,0x02,0xff,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_dec a1, v[2:3], a2, off glc + +// GFX90A: global_atomic_swap_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x81,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_swap_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_cmpswap_x2 a[2:3], v[2:3], a[2:5], off glc ; encoding: [0x00,0x80,0x85,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_cmpswap_x2 a[2:3], v[2:3], a[2:5], off glc + +// GFX90A: global_atomic_add_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x89,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_add_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_sub_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x8d,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_sub_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_smin_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x91,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_smin_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_umin_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x95,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_umin_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_smax_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x99,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and 
stores not supported on this GPU +global_atomic_smax_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_umax_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x9d,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_umax_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_and_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa1,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_and_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_or_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa5,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_or_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_xor_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa9,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_xor_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_inc_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xad,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_inc_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_dec_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xb1,0xdd,0x02,0x02,0xff,0x02] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_dec_x2 a[2:3], v[2:3], a[2:3], off glc + +// GFX90A: global_atomic_swap v[2:3], a2, off ; encoding: [0x00,0x80,0x00,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_swap v[2:3], a2, off + +// GFX90A: global_atomic_cmpswap v[2:3], a[2:3], off ; 
encoding: [0x00,0x80,0x04,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_cmpswap v[2:3], a[2:3], off + +// GFX90A: global_atomic_add v[2:3], a2, off ; encoding: [0x00,0x80,0x08,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_add v[2:3], a2, off + +// GFX90A: global_atomic_sub v[2:3], a2, off ; encoding: [0x00,0x80,0x0c,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_sub v[2:3], a2, off + +// GFX90A: global_atomic_smin v[2:3], a2, off ; encoding: [0x00,0x80,0x10,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_smin v[2:3], a2, off + +// GFX90A: global_atomic_umin v[2:3], a2, off ; encoding: [0x00,0x80,0x14,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_umin v[2:3], a2, off + +// GFX90A: global_atomic_smax v[2:3], a2, off ; encoding: [0x00,0x80,0x18,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_smax v[2:3], a2, off + +// GFX90A: global_atomic_umax v[2:3], a2, off ; encoding: [0x00,0x80,0x1c,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_umax v[2:3], a2, off + +// GFX90A: global_atomic_and v[2:3], a2, off ; encoding: [0x00,0x80,0x20,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_and v[2:3], a2, off + +// GFX90A: global_atomic_or v[2:3], a2, off ; encoding: [0x00,0x80,0x24,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not 
supported on this GPU +global_atomic_or v[2:3], a2, off + +// GFX90A: global_atomic_xor v[2:3], a2, off ; encoding: [0x00,0x80,0x28,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_xor v[2:3], a2, off + +// GFX90A: global_atomic_inc v[2:3], a2, off ; encoding: [0x00,0x80,0x2c,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_inc v[2:3], a2, off + +// GFX90A: global_atomic_dec v[2:3], a2, off ; encoding: [0x00,0x80,0x30,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_dec v[2:3], a2, off + +// GFX90A: global_atomic_swap_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x80,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_swap_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_cmpswap_x2 v[2:3], a[2:5], off ; encoding: [0x00,0x80,0x84,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_cmpswap_x2 v[2:3], a[2:5], off + +// GFX90A: global_atomic_add_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x88,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_add_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_sub_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x8c,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_sub_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_smin_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x90,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_smin_x2 v[2:3], a[2:3], off + +// 
GFX90A: global_atomic_umin_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x94,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_umin_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_smax_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x98,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_smax_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_umax_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x9c,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_umax_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_and_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa0,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_and_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_or_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa4,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_or_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_xor_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa8,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_xor_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_inc_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xac,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_inc_x2 v[2:3], a[2:3], off + +// GFX90A: global_atomic_dec_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xb0,0xdd,0x02,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +global_atomic_dec_x2 v[2:3], a[2:3], off + +// GFX90A: scratch_load_ubyte 
a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, s2 offset:-1 + +// GFX90A: scratch_load_ubyte a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a255, off, s2 offset:-1 + +// GFX90A: scratch_load_ubyte a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, s101 offset:-1 + +// GFX90A: scratch_load_ubyte a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_ubyte a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_ubyte a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_ubyte a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_ubyte a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, v0, 
off offset:-1 + +// GFX90A: scratch_load_ubyte a5, off, s2 ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, s2 + +// GFX90A: scratch_load_ubyte a5, off, s2 ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, s2 + +// GFX90A: scratch_load_ubyte a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x40,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, s2 offset:4095 + +// GFX90A: scratch_load_ubyte a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x40,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, s2 offset:-1 + +// GFX90A: scratch_load_sbyte a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a255, off, s2 offset:-1 + +// GFX90A: 
scratch_load_sbyte a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, s101 offset:-1 + +// GFX90A: scratch_load_sbyte a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_sbyte a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_sbyte a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_sbyte a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_sbyte a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, v0, off offset:-1 + +// GFX90A: scratch_load_sbyte a5, off, s2 ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, s2 + +// GFX90A: scratch_load_sbyte a5, off, s2 ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, s2 + +// GFX90A: 
scratch_load_sbyte a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x44,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, s2 offset:4095 + +// GFX90A: scratch_load_sbyte a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x44,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_ushort a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, s2 offset:-1 + +// GFX90A: scratch_load_ushort a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a255, off, s2 offset:-1 + +// GFX90A: scratch_load_ushort a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, s101 offset:-1 + +// GFX90A: scratch_load_ushort a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, 
flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_ushort a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_ushort a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_ushort a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_ushort a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, v0, off offset:-1 + +// GFX90A: scratch_load_ushort a5, off, s2 ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, s2 + +// GFX90A: scratch_load_ushort a5, off, s2 ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, s2 + +// GFX90A: scratch_load_ushort a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x48,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, s2 offset:4095 + +// GFX90A: scratch_load_ushort a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x48,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort 
a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_ushort a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_ushort a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ushort a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_sshort a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, s2 offset:-1 + +// GFX90A: scratch_load_sshort a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a255, off, s2 offset:-1 + +// GFX90A: scratch_load_sshort a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, s101 offset:-1 + +// GFX90A: scratch_load_sshort a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_sshort a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_sshort a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: 
agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_sshort a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_sshort a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, v0, off offset:-1 + +// GFX90A: scratch_load_sshort a5, off, s2 ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, s2 + +// GFX90A: scratch_load_sshort a5, off, s2 ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, s2 + +// GFX90A: scratch_load_sshort a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x4c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, s2 offset:4095 + +// GFX90A: scratch_load_sshort a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x4c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_sshort a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sshort a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_sshort a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores 
not supported on this GPU +scratch_load_sshort a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_dword a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, s2 offset:-1 + +// GFX90A: scratch_load_dword a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a255, off, s2 offset:-1 + +// GFX90A: scratch_load_dword a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, s101 offset:-1 + +// GFX90A: scratch_load_dword a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_dword a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_dword a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_dword a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_dword a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, v0, off offset:-1 + +// GFX90A: scratch_load_dword a5, off, s2 ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, s2 + +// GFX90A: scratch_load_dword a5, off, s2 ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, s2 + +// GFX90A: scratch_load_dword a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x50,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, s2 offset:4095 + +// GFX90A: scratch_load_dword a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x50,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_dword a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_dword a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dword a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, s2 offset:-1 + +// GFX90A: scratch_load_dwordx2 a[254:255], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0xfe] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[254:255], off, s2 offset:-1 + +// GFX90A: scratch_load_dwordx2 a[6:7], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe5,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, s101 offset:-1 + +// GFX90A: scratch_load_dwordx2 a[6:7], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe6,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_dwordx2 a[6:7], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe7,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_dwordx2 a[6:7], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xea,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, vcc_lo offset:-1 + +// GFX90A: scratch_load_dwordx2 a[6:7], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xeb,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, vcc_hi offset:-1 + +// GFX90A: scratch_load_dwordx2 a[6:7], v0, off offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], v0, off offset:-1 + +// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, s2 + +// GFX90A: scratch_load_dwordx2 
a[6:7], off, s2 ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, s2 + +// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x54,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, s2 offset:4095 + +// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x54,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, s2 offset:-4096 + +// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, s2 offset:-1 glc + +// GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx2 a[6:7], off, s2 offset:-1 slc + +// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, s2 offset:-1 + +// GFX90A: scratch_load_dwordx3 a[252:254], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[252:254], off, s2 offset:-1 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe5,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+scratch_load_dwordx3 a[6:8], off, s101 offset:-1 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe6,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe7,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xea,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, vcc_lo offset:-1 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xeb,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, vcc_hi offset:-1 + +// GFX90A: scratch_load_dwordx3 a[6:8], v0, off offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], v0, off offset:-1 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, s2 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, s2 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:4095 ; encoding: 
[0xff,0x4f,0x58,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, s2 offset:4095 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x58,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, s2 offset:-4096 + +// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, s2 offset:-1 glc + +// GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx3 a[6:8], off, s2 offset:-1 slc + +// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, s2 offset:-1 + +// GFX90A: scratch_load_dwordx4 a[252:255], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[252:255], off, s2 offset:-1 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe5,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, s101 offset:-1 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe6,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+scratch_load_dwordx4 a[6:9], off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe7,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xea,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, vcc_lo offset:-1 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xeb,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, vcc_hi offset:-1 + +// GFX90A: scratch_load_dwordx4 a[6:9], v0, off offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], v0, off offset:-1 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, s2 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, s2 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x5c,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, s2 offset:4095 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x5c,0xdc,0x00,0x00,0x82,0x06] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, s2 offset:-4096 + +// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, s2 offset:-1 glc + +// GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_dwordx4 a[6:9], off, s2 offset:-1 slc + +// GFX90A: scratch_store_byte off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, s3 offset:-1 + +// GFX90A: scratch_store_byte off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0xff,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a255, s3 offset:-1 + +// GFX90A: scratch_store_byte off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe5,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, s101 offset:-1 + +// GFX90A: scratch_store_byte off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe6,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, flat_scratch_lo offset:-1 + +// GFX90A: scratch_store_byte off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe7,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, flat_scratch_hi offset:-1 + +// GFX90A: scratch_store_byte 
off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xea,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, vcc_lo offset:-1 + +// GFX90A: scratch_store_byte off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xeb,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, vcc_hi offset:-1 + +// GFX90A: scratch_store_byte v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte v0, a2, off offset:-1 + +// GFX90A: scratch_store_byte off, a2, s3 ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, s3 + +// GFX90A: scratch_store_byte off, a2, s3 ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, s3 + +// GFX90A: scratch_store_byte off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x60,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, s3 offset:4095 + +// GFX90A: scratch_store_byte off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x60,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, s3 offset:-4096 + +// GFX90A: scratch_store_byte off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, s3 offset:-1 glc + +// GFX90A: scratch_store_byte off, a2, s3 offset:-1 slc ; encoding: 
[0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte off, a2, s3 offset:-1 slc + +// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, s3 offset:-1 + +// GFX90A: scratch_store_byte_d16_hi off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0xff,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a255, s3 offset:-1 + +// GFX90A: scratch_store_byte_d16_hi off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe5,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, s101 offset:-1 + +// GFX90A: scratch_store_byte_d16_hi off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe6,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, flat_scratch_lo offset:-1 + +// GFX90A: scratch_store_byte_d16_hi off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe7,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, flat_scratch_hi offset:-1 + +// GFX90A: scratch_store_byte_d16_hi off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xea,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, vcc_lo offset:-1 + +// GFX90A: scratch_store_byte_d16_hi off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xeb,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores 
not supported on this GPU +scratch_store_byte_d16_hi off, a2, vcc_hi offset:-1 + +// GFX90A: scratch_store_byte_d16_hi v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi v0, a2, off offset:-1 + +// GFX90A: scratch_store_byte_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, s3 + +// GFX90A: scratch_store_byte_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, s3 + +// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x64,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, s3 offset:4095 + +// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x64,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, s3 offset:-4096 + +// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, s3 offset:-1 glc + +// GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_byte_d16_hi off, a2, s3 offset:-1 slc + +// GFX90A: scratch_store_short off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, s3 offset:-1 + +// GFX90A: scratch_store_short off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0xff,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a255, s3 offset:-1 + +// GFX90A: scratch_store_short off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe5,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, s101 offset:-1 + +// GFX90A: scratch_store_short off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe6,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, flat_scratch_lo offset:-1 + +// GFX90A: scratch_store_short off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe7,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, flat_scratch_hi offset:-1 + +// GFX90A: scratch_store_short off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xea,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, vcc_lo offset:-1 + +// GFX90A: scratch_store_short off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xeb,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, vcc_hi offset:-1 + +// GFX90A: scratch_store_short v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short v0, a2, off offset:-1 + +// GFX90A: scratch_store_short off, a2, s3 ; 
encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, s3 + +// GFX90A: scratch_store_short off, a2, s3 ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, s3 + +// GFX90A: scratch_store_short off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x68,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, s3 offset:4095 + +// GFX90A: scratch_store_short off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x68,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, s3 offset:-4096 + +// GFX90A: scratch_store_short off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, s3 offset:-1 glc + +// GFX90A: scratch_store_short off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short off, a2, s3 offset:-1 slc + +// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, s3 offset:-1 + +// GFX90A: scratch_store_short_d16_hi off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0xff,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a255, s3 offset:-1 + +// GFX90A: scratch_store_short_d16_hi off, 
a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe5,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, s101 offset:-1 + +// GFX90A: scratch_store_short_d16_hi off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe6,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, flat_scratch_lo offset:-1 + +// GFX90A: scratch_store_short_d16_hi off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe7,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, flat_scratch_hi offset:-1 + +// GFX90A: scratch_store_short_d16_hi off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xea,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, vcc_lo offset:-1 + +// GFX90A: scratch_store_short_d16_hi off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xeb,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, vcc_hi offset:-1 + +// GFX90A: scratch_store_short_d16_hi v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi v0, a2, off offset:-1 + +// GFX90A: scratch_store_short_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, s3 + +// GFX90A: scratch_store_short_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr 
loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, s3 + +// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x6c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, s3 offset:4095 + +// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x6c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, s3 offset:-4096 + +// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, s3 offset:-1 glc + +// GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_short_d16_hi off, a2, s3 offset:-1 slc + +// GFX90A: scratch_store_dword off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, s3 offset:-1 + +// GFX90A: scratch_store_dword off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0xff,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a255, s3 offset:-1 + +// GFX90A: scratch_store_dword off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe5,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, s101 offset:-1 + +// GFX90A: scratch_store_dword off, a2, flat_scratch_lo offset:-1 ; encoding: 
[0xff,0x5f,0x70,0xdc,0x00,0x02,0xe6,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, flat_scratch_lo offset:-1 + +// GFX90A: scratch_store_dword off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe7,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, flat_scratch_hi offset:-1 + +// GFX90A: scratch_store_dword off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xea,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, vcc_lo offset:-1 + +// GFX90A: scratch_store_dword off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xeb,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, vcc_hi offset:-1 + +// GFX90A: scratch_store_dword v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword v0, a2, off offset:-1 + +// GFX90A: scratch_store_dword off, a2, s3 ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, s3 + +// GFX90A: scratch_store_dword off, a2, s3 ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, s3 + +// GFX90A: scratch_store_dword off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, s3 offset:4095 + +// GFX90A: scratch_store_dword off, a2, s3 offset:-4096 ; 
encoding: [0x00,0x50,0x70,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, s3 offset:-4096 + +// GFX90A: scratch_store_dword off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, s3 offset:-1 glc + +// GFX90A: scratch_store_dword off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dword off, a2, s3 offset:-1 slc + +// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], s3 offset:-1 + +// GFX90A: scratch_store_dwordx2 off, a[254:255], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0xfe,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[254:255], s3 offset:-1 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], s101 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe5,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], s101 offset:-1 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe6,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], flat_scratch_lo offset:-1 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe7,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on 
this GPU +scratch_store_dwordx2 off, a[2:3], flat_scratch_hi offset:-1 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xea,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], vcc_lo offset:-1 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xeb,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], vcc_hi offset:-1 + +// GFX90A: scratch_store_dwordx2 v0, a[2:3], off offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 v0, a[2:3], off offset:-1 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], s3 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], s3 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:4095 ; encoding: [0xff,0x4f,0x74,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], s3 offset:4095 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-4096 ; encoding: [0x00,0x50,0x74,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], s3 offset:-4096 + +// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00] 
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], s3 offset:-1 glc + +// GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx2 off, a[2:3], s3 offset:-1 slc + +// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], s3 offset:-1 + +// GFX90A: scratch_store_dwordx3 off, a[252:254], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0xfc,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[252:254], s3 offset:-1 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], s101 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe5,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], s101 offset:-1 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe6,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], flat_scratch_lo offset:-1 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe7,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], flat_scratch_hi offset:-1 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xea,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+scratch_store_dwordx3 off, a[2:4], vcc_lo offset:-1 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xeb,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], vcc_hi offset:-1 + +// GFX90A: scratch_store_dwordx3 v0, a[2:4], off offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 v0, a[2:4], off offset:-1 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], s3 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], s3 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:4095 ; encoding: [0xff,0x4f,0x78,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], s3 offset:4095 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-4096 ; encoding: [0x00,0x50,0x78,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], s3 offset:-4096 + +// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], s3 offset:-1 glc + +// GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx3 off, a[2:4], s3 offset:-1 slc + +// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], s3 offset:-1 + +// GFX90A: scratch_store_dwordx4 off, a[252:255], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0xfc,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[252:255], s3 offset:-1 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], s101 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe5,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], s101 offset:-1 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe6,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], flat_scratch_lo offset:-1 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe7,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], flat_scratch_hi offset:-1 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xea,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], vcc_lo offset:-1 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xeb,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+scratch_store_dwordx4 off, a[2:5], vcc_hi offset:-1 + +// GFX90A: scratch_store_dwordx4 v0, a[2:5], off offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 v0, a[2:5], off offset:-1 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], s3 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], s3 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:4095 ; encoding: [0xff,0x4f,0x7c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], s3 offset:4095 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-4096 ; encoding: [0x00,0x50,0x7c,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], s3 offset:-4096 + +// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], s3 offset:-1 glc + +// GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_store_dwordx4 off, a[2:5], s3 offset:-1 slc + +// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, s2 offset:-1 + +// GFX90A: scratch_load_ubyte_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a255, off, s2 offset:-1 + +// GFX90A: scratch_load_ubyte_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, s101 offset:-1 + +// GFX90A: scratch_load_ubyte_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_ubyte_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_ubyte_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_ubyte_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_ubyte_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, v0, off offset:-1 + +// GFX90A: 
scratch_load_ubyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, s2 + +// GFX90A: scratch_load_ubyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, s2 + +// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x80,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, s2 offset:4095 + +// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x80,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16 a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 + +// GFX90A: scratch_load_ubyte_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi 
a255, off, s2 offset:-1 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, s101 offset:-1 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_ubyte_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, v0, off offset:-1 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, s2 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 ; encoding: 
[0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, s2 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x84,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, s2 offset:4095 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x84,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, s2 offset:-1 + +// GFX90A: scratch_load_sbyte_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a255, off, s2 offset:-1 + +// GFX90A: scratch_load_sbyte_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, 
s101 offset:-1 + +// GFX90A: scratch_load_sbyte_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_sbyte_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_sbyte_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_sbyte_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_sbyte_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, v0, off offset:-1 + +// GFX90A: scratch_load_sbyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, s2 + +// GFX90A: scratch_load_sbyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, s2 + +// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x88,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and 
stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, s2 offset:4095 + +// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x88,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16 a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 + +// GFX90A: scratch_load_sbyte_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a255, off, s2 offset:-1 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, s101 offset:-1 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, 
flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_sbyte_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, v0, off offset:-1 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, s2 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, s2 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x8c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, s2 offset:4095 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x8c,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+scratch_load_sbyte_d16_hi a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, s2 offset:-1 + +// GFX90A: scratch_load_short_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a255, off, s2 offset:-1 + +// GFX90A: scratch_load_short_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, s101 offset:-1 + +// GFX90A: scratch_load_short_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_short_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_short_d16 a5, off, vcc_lo offset:-1 ; encoding: 
[0xff,0x5f,0x90,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_short_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_short_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, v0, off offset:-1 + +// GFX90A: scratch_load_short_d16 a5, off, s2 ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, s2 + +// GFX90A: scratch_load_short_d16 a5, off, s2 ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, s2 + +// GFX90A: scratch_load_short_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x90,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, s2 offset:4095 + +// GFX90A: scratch_load_short_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x90,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_short_d16 a5, off, s2 
offset:-1 slc ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16 a5, off, s2 offset:-1 slc + +// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, s2 offset:-1 + +// GFX90A: scratch_load_short_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a255, off, s2 offset:-1 + +// GFX90A: scratch_load_short_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe5,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, s101 offset:-1 + +// GFX90A: scratch_load_short_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe6,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, flat_scratch_lo offset:-1 + +// GFX90A: scratch_load_short_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe7,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, flat_scratch_hi offset:-1 + +// GFX90A: scratch_load_short_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xea,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, vcc_lo offset:-1 + +// GFX90A: scratch_load_short_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xeb,0x05] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, vcc_hi offset:-1 + +// GFX90A: scratch_load_short_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, v0, off offset:-1 + +// GFX90A: scratch_load_short_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, s2 + +// GFX90A: scratch_load_short_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, s2 + +// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x94,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, s2 offset:4095 + +// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x94,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, s2 offset:-4096 + +// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, s2 offset:-1 glc + +// GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +scratch_load_short_d16_hi a5, off, s2 offset:-1 slc + +// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 ; 
encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_x a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_x a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_format_x a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_format_x a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_format_x a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_format_x a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_format_x a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_format_x a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_format_x a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_format_x a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x00,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_format_x a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x00,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], s3 + +// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], s3 + +// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x00,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], s3 offset:7 + 
+// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x00,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x01,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_x a5, off, s[8:11], s3 offset:4095 lds + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_xy a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: 
buffer_load_format_xy a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_format_xy a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x04,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: 
buffer_load_format_xy a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x04,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x04,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x04,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_xyz a[252:254], off, s[8:11], s3 
offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0xfc,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[252:254], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], 0.5 
offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x08,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x08,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], s3 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], s3 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x08,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 glc ; encoding: 
[0xff,0x4f,0x08,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[252:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], 
m0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x0c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x0c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: 
buffer_load_format_xyzw a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], s3 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], s3 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x0c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0e,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_x a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0xff,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a255, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_x a1, off, s[16:19], s4 
offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_format_x a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], -4.0 offset:4095 ; encoding: 
[0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_format_x a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x10,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_format_x a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x10,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], s4 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], s4 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x10,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x10,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x12,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +buffer_store_format_x a1, off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_xy a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0xfe,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[254:255], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x80] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x14,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x14,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], s4 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], s4 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x14,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x14,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x16,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_xyz a[252:254], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0xfc,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[252:254], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x98,0x04] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], v0, s[12:15], s4 idxen offset:4095 ; encoding: 
[0xff,0x2f,0x18,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x18,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], s4 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], s4 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x18,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x18,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x1a,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 ; encoding: 
[0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[252:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[252:255], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], 
off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x1c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x1c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], s4 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], s4 + +// GFX90A: buffer_store_format_xyzw 
a[2:5], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x1c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x1e,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_x a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_x a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_x a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[96:99], s3 offset:4095 + +// GFX90A: 
buffer_load_format_d16_x a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_format_d16_x a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x20,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: 
buffer_load_format_d16_x a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x20,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], s3 + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], s3 + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x20,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x20,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x22,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a255, off, s[8:11], s3 offset:4095 ; 
encoding: [0xff,0x0f,0x24,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], 0.5 offset:4095 ; encoding: 
[0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x24,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x24,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], s3 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], s3 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x24,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x24,0xe0,0x00,0x05,0x82,0x03] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x26,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], m0 offset:4095 
; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x28,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x28,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 offen offset:4095 + +// 
GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x28,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x28,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2a,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[254:255], off, s[8:11], s3 
offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported 
on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x2c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x2c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x2c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: 
agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2e,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_x a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0xff,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a255, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x7c] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x30,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x30,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], s4 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], s4 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x30,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x30,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x32,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0xff,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a255, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr 
loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf7] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x34,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x34,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], s4 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], s4 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x34,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x34,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x36,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads 
and stores not supported on this GPU +buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0xfe,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[254:255], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0 offset:4095 ; encoding: 
[0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x38,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x38,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 + +// GFX90A: 
buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x38,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x38,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x3a,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0xfe,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[254:255], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+buffer_store_format_d16_xyzw a[2:3], off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf7] 
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x3c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x3c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x3c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 
offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_ubyte a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_ubyte a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_ubyte a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_ubyte a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x40,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_ubyte a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x40,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], s3 + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], s3 + +// GFX90A: buffer_load_ubyte a5, off, 
s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x40,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x40,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x41,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 lds + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_sbyte a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_sbyte a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_sbyte a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_sbyte a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x44,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, v0, s[8:11], s3 idxen offset:4095 + 
+// GFX90A: buffer_load_sbyte a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x44,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], s3 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], s3 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x44,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x44,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x45,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 lds + +// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_ushort a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_ushort a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_ushort a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_ushort a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_ushort a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_ushort a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_ushort a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], -1 offset:4095 + +// 
GFX90A: buffer_load_ushort a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_ushort a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_ushort a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x48,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_ushort a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x48,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], s3 + +// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], s3 + +// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x48,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x48,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x49,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ushort a5, off, s[8:11], s3 offset:4095 lds + +// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_sshort a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_sshort a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_sshort a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, 
s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_sshort a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x4c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_sshort a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x4c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 ; encoding: 
[0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], s3 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], s3 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x4c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x4d,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sshort a5, off, s[8:11], s3 offset:4095 lds + +// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_dword a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+buffer_load_dword a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_dword a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_dword a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_dword a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_dword a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_dword a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_dword a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_dword a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_dword a5, off, s[8:11], -4.0 offset:4095 ; 
encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_dword a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x50,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_dword a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x50,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_dword a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], s3 + +// GFX90A: buffer_load_dword a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], s3 + +// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x50,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x50,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+buffer_load_dword a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x51,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dword a5, off, s[8:11], s3 offset:4095 lds + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], 
m0 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x54,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x54,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_load_dwordx2 
a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x54,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x54,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[252:254], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0xfc,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[252:254], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[96:99], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x58,0xe0,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 idxen offset:4095 ; encoding: 
[0xff,0x2f,0x58,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x58,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], s3 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], s3 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x58,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x58,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[252:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores 
not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x5c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x5c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], s3 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], s3 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], s3 
offset:7 + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x5c,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xe0,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_byte a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a255, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_byte a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_byte a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_byte a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_byte a1, off, s[12:15], m0 offset:4095 ; 
encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_byte a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_byte a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_byte a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_byte a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_byte a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x60,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_byte a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x60,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_byte a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads 
and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], s4 + +// GFX90A: buffer_store_byte a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], s4 + +// GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x60,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x60,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte a1, off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0xff,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a255, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[16:19], s4 offset:4095 + +// GFX90A: 
buffer_store_byte_d16_hi a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], -4.0 offset:4095 + +// GFX90A: 
buffer_store_byte_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x64,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x64,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], s4 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], s4 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x64,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x64,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_short a1, off, s[12:15], s4 
offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_short a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a255, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_short a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_short a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_short a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_short a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_short a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_short a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xc1] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_short a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_short a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_short a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x68,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_short a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x68,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_short a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], s4 + +// GFX90A: buffer_store_short a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], s4 + +// GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x68,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], s4 offset:7 + +// GFX90A: 
buffer_store_short a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x68,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short a1, off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_short_d16_hi a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0xff,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a255, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], s101 offset:4095 + +// GFX90A: 
buffer_store_short_d16_hi a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_short_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x6c,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_short_d16_hi a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x6c,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, v0, s[12:15], s4 offen offset:4095 + 
+// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], s4 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], s4 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x6c,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_dword a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0xff,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a255, off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_dword a1, off, s[16:19], s4 offset:4095 ; 
encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_dword a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_dword a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_dword a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_dword a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_dword a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_dword a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_dword a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr 
loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_dword a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x70,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_dword a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x70,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_dword a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], s4 + +// GFX90A: buffer_store_dword a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], s4 + +// GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x70,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x70,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xe0,0x00,0x01,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dword a1, off, s[12:15], s4 offset:4095 slc + +// GFX90A: 
buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_dwordx2 a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0xfe,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[254:255], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], 0 offset:4095 + +// GFX90A: 
buffer_store_dwordx2 a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x74,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x74,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], s4 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], s4 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 
offset:7 ; encoding: [0x07,0x00,0x74,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x74,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[252:254], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0xfc,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[252:254], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s101 
offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x78,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 offen 
offset:4095 ; encoding: [0xff,0x1f,0x78,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], s4 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], s4 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x78,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x78,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[252:255], off, s[12:15], s4 offset:4095 ; encoding: 
[0xff,0x0f,0x7c,0xe0,0x00,0xfc,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[252:255], off, s[12:15], s4 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x84,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[16:19], s4 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x98,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[96:99], s4 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], s101 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], m0 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], 0 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], -1 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], 0.5 offset:4095 ; encoding: 
[0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], 0.5 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], -4.0 offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x7c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 idxen offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x7c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 offen offset:4095 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], s4 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], s4 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:7 + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x7c,0xe0,0x00,0x02,0x83,0x04] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 glc + +// GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xe0,0x00,0x02,0x83,0x04] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 slc + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: 
agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x80,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x80,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+buffer_load_ubyte_d16 a5, off, s[8:11], s3 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], s3 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x80,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x80,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[12:15], s3 offset:4095 + +// GFX90A: 
buffer_load_ubyte_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, 
v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x84,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x84,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x84,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x84,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x88,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x88,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], s3 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], s3 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x88,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, 
s[8:11], s3 offset:7 + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x88,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], s101 offset:4095 + 
+// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x8c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x8c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: 
buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x8c,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_short_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_short_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x90,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_short_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_short_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x90,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_short_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x90,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], s3 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], s3 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x90,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x90,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16 a5, off, 
s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_short_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], 0 offset:4095 + 
+// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf0] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], 0.5 offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf7] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], -4.0 offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x94,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x94,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], s3 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], s3 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 
offset:7 ; encoding: [0x07,0x00,0x94,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x94,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xe0,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_swap a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_swap a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_swap a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_swap a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x65] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_swap a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_swap a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_swap a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_swap a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x00,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_swap a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x00,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], s3 + +// 
GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x00,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x00,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, 
s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x04,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x04,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 ; encoding: 
[0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x04,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x04,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_add a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_add a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_add a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_add a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_add a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_add a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_add a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_add a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x08,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_add a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x08,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], s3 + +// GFX90A: 
buffer_atomic_add a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x08,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x08,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_sub a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_sub a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_sub a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_sub a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_sub a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_sub a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_sub a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_sub a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x0c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_sub a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x0c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_sub 
a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x0c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0e,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_smin a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_smin a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_smin a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: 
agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_smin a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_smin a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_smin a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_smin a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_smin a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x10,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_smin a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x10,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], s3 + +// GFX90A: 
buffer_atomic_smin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x10,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x10,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x12,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_umin a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_umin a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_umin a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_umin a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_umin a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_umin a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_umin a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_umin a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x14,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_umin a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x14,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], s3 
+ +// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x14,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x14,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x16,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_smax a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_smax a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_smax a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x98,0x03] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_smax a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x18,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_smax a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x18,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax 
a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x18,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x18,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x1a,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_umax a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_umax a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_umax a5, off, s[96:99], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x1c,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_umax a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x1c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_umax a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x1c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores 
not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x1c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x1e,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_and a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_and a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_and a5, off, s[96:99], s3 offset:4095 ; 
encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_and a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_and a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_and a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_and a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_and a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x20,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_and a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x20,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not 
supported on this GPU +buffer_atomic_and a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x20,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x20,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x22,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_or a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_or a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_or a5, off, s[96:99], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x24,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_or a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_or a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_or a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_or a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_or a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x24,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_or a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x24,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+buffer_atomic_or a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x24,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x24,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x26,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_xor a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_xor a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_xor a5, off, s[96:99], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x28,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_xor a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x28,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_xor a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x28,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported 
on this GPU +buffer_atomic_xor a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x28,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x28,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2a,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_inc a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_inc a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_inc a5, off, s[96:99], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x2c,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_inc a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x2c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_inc a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x2c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported 
on this GPU +buffer_atomic_inc a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x2c,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2e,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_dec a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0xff,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a255, off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_dec a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_dec a5, off, s[96:99], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x30,0xe1,0x00,0x05,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_dec a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x30,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_dec a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x30,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported 
on this GPU +buffer_atomic_dec a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[8:11], s3 + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x30,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x30,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x32,0xe1,0x00,0x05,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_swap_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_swap_x2 
a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x80,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x80,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_swap_x2 
a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x80,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x80,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0xfc,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[252:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[12:15], s3 offset:4095 ; 
encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x84,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], 
v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x84,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x84,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x84,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[254:255], off, s[8:11], s3 
offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: 
[0xff,0x2f,0x88,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x88,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x88,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x88,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x8c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x8c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x8c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+buffer_atomic_smin_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x90,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x90,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x90,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x90,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 
offset:4095 glc + +// GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], 
m0 offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x94,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x94,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x94,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_umin_x2 a[6:7], 
off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x94,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_smax_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_smax_x2 
a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x98,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x98,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: 
[0x07,0x00,0x98,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x98,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x9a,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s101 offset:4095 ; 
encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x9c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x9c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 ; encoding: 
[0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x9c,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x9e,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: 
[0xff,0x0f,0xa0,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 ; encoding: 
[0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xa2,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa4,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa4,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: 
agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa4,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xa6,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 
a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa8,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 
idxen offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa8,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa8,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xaa,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[254:255], 
off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 idxen 
offset:4095 ; encoding: [0xff,0x2f,0xac,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xac,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xac,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xac,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xae,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: 
[0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0xfe,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[254:255], off, s[8:11], s3 offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x83,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[12:15], s3 offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x98,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[96:99], s3 offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x65] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], s101 offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x7c] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], m0 offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x80] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], 0 offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: 
[0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0xc1] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], -1 offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xb0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xb0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:7 + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xb0,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 glc + +// GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xb2,0xe1,0x00,0x06,0x82,0x03] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 slc + +// GFX90A: tbuffer_load_format_x a1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x81,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_load_format_x a1, off, s[4:7], dfmt:15, nfmt:2, s1 + +// GFX90A: tbuffer_load_format_xy a[2:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x02,0x81,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_load_format_xy a[2:3], off, s[4:7], dfmt:15, nfmt:2, s1 + +// GFX90A: tbuffer_load_format_xyz a[2:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x02,0x81,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_load_format_xyz a[2:4], off, s[4:7], dfmt:15, nfmt:2, s1 + +// GFX90A: tbuffer_load_format_xyzw a[2:5], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x02,0x81,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_load_format_xyzw a[2:5], off, s[4:7], dfmt:15, nfmt:2, s1 + +// GFX90A: tbuffer_store_format_x a1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x81,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_store_format_x a1, off, s[4:7], dfmt:15, nfmt:2, s1 + +// GFX90A: tbuffer_store_format_xy a[2:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x02,0x81,0x01] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_store_format_xy a[2:3], off, s[4:7], dfmt:15, nfmt:2, s1 + +// GFX90A: tbuffer_store_format_xyzw a[2:5], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x81,0x01] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_store_format_xyzw a[2:5], off, s[4:7], dfmt:15, nfmt:2, s1 + +// GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 + +// GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15] ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x02,0x9c,0x6d] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], dfmt:15, nfmt:0, ttmp1 + +// GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x03,0xe9,0x00,0x02,0x9c,0x6d] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], dfmt:0, nfmt:2, ttmp1 + +// GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], dfmt:15, nfmt:2, ttmp1 + +// GFX90A: ds_add_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +ds_add_u32 v1, a2 offset:65535 + +// GFX90A: ds_add_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u32 v255, a2 offset:65535 + +// GFX90A: ds_add_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u32 v1, a255 offset:65535 + +// GFX90A: ds_add_u32 v1, a2 ; encoding: [0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u32 v1, a2 + +// GFX90A: ds_add_u32 v1, a2 ; encoding: [0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u32 v1, a2 + +// GFX90A: ds_add_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x00,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u32 v1, a2 offset:4 + +// GFX90A: ds_add_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x01,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u32 v1, a2 offset:65535 gds + +// GFX90A: ds_sub_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u32 v1, a2 offset:65535 + +// GFX90A: ds_sub_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u32 v255, a2 offset:65535 + +// GFX90A: ds_sub_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +ds_sub_u32 v1, a255 offset:65535 + +// GFX90A: ds_sub_u32 v1, a2 ; encoding: [0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u32 v1, a2 + +// GFX90A: ds_sub_u32 v1, a2 ; encoding: [0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u32 v1, a2 + +// GFX90A: ds_sub_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x02,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u32 v1, a2 offset:4 + +// GFX90A: ds_sub_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x03,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u32 v1, a2 offset:65535 gds + +// GFX90A: ds_rsub_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u32 v1, a2 offset:65535 + +// GFX90A: ds_rsub_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u32 v255, a2 offset:65535 + +// GFX90A: ds_rsub_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u32 v1, a255 offset:65535 + +// GFX90A: ds_rsub_u32 v1, a2 ; encoding: [0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u32 v1, a2 + +// GFX90A: ds_rsub_u32 v1, a2 ; encoding: [0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores 
not supported on this GPU +ds_rsub_u32 v1, a2 + +// GFX90A: ds_rsub_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x04,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u32 v1, a2 offset:4 + +// GFX90A: ds_rsub_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x05,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u32 v1, a2 offset:65535 gds + +// GFX90A: ds_inc_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u32 v1, a2 offset:65535 + +// GFX90A: ds_inc_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u32 v255, a2 offset:65535 + +// GFX90A: ds_inc_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u32 v1, a255 offset:65535 + +// GFX90A: ds_inc_u32 v1, a2 ; encoding: [0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u32 v1, a2 + +// GFX90A: ds_inc_u32 v1, a2 ; encoding: [0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u32 v1, a2 + +// GFX90A: ds_inc_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x06,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u32 v1, a2 offset:4 + +// GFX90A: ds_inc_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x07,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this 
GPU +ds_inc_u32 v1, a2 offset:65535 gds + +// GFX90A: ds_dec_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u32 v1, a2 offset:65535 + +// GFX90A: ds_dec_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u32 v255, a2 offset:65535 + +// GFX90A: ds_dec_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u32 v1, a255 offset:65535 + +// GFX90A: ds_dec_u32 v1, a2 ; encoding: [0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u32 v1, a2 + +// GFX90A: ds_dec_u32 v1, a2 ; encoding: [0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u32 v1, a2 + +// GFX90A: ds_dec_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x08,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u32 v1, a2 offset:4 + +// GFX90A: ds_dec_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x09,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u32 v1, a2 offset:65535 gds + +// GFX90A: ds_min_i32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i32 v1, a2 offset:65535 + +// GFX90A: ds_min_i32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+ds_min_i32 v255, a2 offset:65535 + +// GFX90A: ds_min_i32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i32 v1, a255 offset:65535 + +// GFX90A: ds_min_i32 v1, a2 ; encoding: [0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i32 v1, a2 + +// GFX90A: ds_min_i32 v1, a2 ; encoding: [0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i32 v1, a2 + +// GFX90A: ds_min_i32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x0a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i32 v1, a2 offset:4 + +// GFX90A: ds_min_i32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0b,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i32 v1, a2 offset:65535 gds + +// GFX90A: ds_max_i32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i32 v1, a2 offset:65535 + +// GFX90A: ds_max_i32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i32 v255, a2 offset:65535 + +// GFX90A: ds_max_i32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i32 v1, a255 offset:65535 + +// GFX90A: ds_max_i32 v1, a2 ; encoding: [0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i32 v1, 
a2 + +// GFX90A: ds_max_i32 v1, a2 ; encoding: [0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i32 v1, a2 + +// GFX90A: ds_max_i32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x0c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i32 v1, a2 offset:4 + +// GFX90A: ds_max_i32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0d,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i32 v1, a2 offset:65535 gds + +// GFX90A: ds_min_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u32 v1, a2 offset:65535 + +// GFX90A: ds_min_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u32 v255, a2 offset:65535 + +// GFX90A: ds_min_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u32 v1, a255 offset:65535 + +// GFX90A: ds_min_u32 v1, a2 ; encoding: [0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u32 v1, a2 + +// GFX90A: ds_min_u32 v1, a2 ; encoding: [0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u32 v1, a2 + +// GFX90A: ds_min_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x0e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u32 v1, a2 offset:4 + +// GFX90A: ds_min_u32 v1, a2 
offset:65535 gds ; encoding: [0xff,0xff,0x0f,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u32 v1, a2 offset:65535 gds + +// GFX90A: ds_max_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u32 v1, a2 offset:65535 + +// GFX90A: ds_max_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u32 v255, a2 offset:65535 + +// GFX90A: ds_max_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u32 v1, a255 offset:65535 + +// GFX90A: ds_max_u32 v1, a2 ; encoding: [0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u32 v1, a2 + +// GFX90A: ds_max_u32 v1, a2 ; encoding: [0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u32 v1, a2 + +// GFX90A: ds_max_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x10,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u32 v1, a2 offset:4 + +// GFX90A: ds_max_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x11,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u32 v1, a2 offset:65535 gds + +// GFX90A: ds_and_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b32 v1, a2 offset:65535 + +// GFX90A: ds_and_b32 v255, a2 
offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b32 v255, a2 offset:65535 + +// GFX90A: ds_and_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b32 v1, a255 offset:65535 + +// GFX90A: ds_and_b32 v1, a2 ; encoding: [0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b32 v1, a2 + +// GFX90A: ds_and_b32 v1, a2 ; encoding: [0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b32 v1, a2 + +// GFX90A: ds_and_b32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x12,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b32 v1, a2 offset:4 + +// GFX90A: ds_and_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x13,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b32 v1, a2 offset:65535 gds + +// GFX90A: ds_or_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b32 v1, a2 offset:65535 + +// GFX90A: ds_or_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b32 v255, a2 offset:65535 + +// GFX90A: ds_or_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b32 v1, a255 offset:65535 + +// GFX90A: ds_or_b32 v1, a2 ; 
encoding: [0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b32 v1, a2 + +// GFX90A: ds_or_b32 v1, a2 ; encoding: [0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b32 v1, a2 + +// GFX90A: ds_or_b32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x14,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b32 v1, a2 offset:4 + +// GFX90A: ds_or_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x15,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b32 v1, a2 offset:65535 gds + +// GFX90A: ds_xor_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b32 v1, a2 offset:65535 + +// GFX90A: ds_xor_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b32 v255, a2 offset:65535 + +// GFX90A: ds_xor_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b32 v1, a255 offset:65535 + +// GFX90A: ds_xor_b32 v1, a2 ; encoding: [0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b32 v1, a2 + +// GFX90A: ds_xor_b32 v1, a2 ; encoding: [0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b32 v1, a2 + +// GFX90A: ds_xor_b32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x16,0xda,0x01,0x02,0x00,0x00] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b32 v1, a2 offset:4 + +// GFX90A: ds_xor_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x17,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b32 v1, a2 offset:65535 gds + +// GFX90A: ds_mskor_b32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b32 v1, a2, a3 offset:65535 + +// GFX90A: ds_mskor_b32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0xff,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b32 v255, a2, a3 offset:65535 + +// GFX90A: ds_mskor_b32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0xff,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b32 v1, a255, a3 offset:65535 + +// GFX90A: ds_mskor_b32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b32 v1, a2, a255 offset:65535 + +// GFX90A: ds_mskor_b32 v1, a2, a3 ; encoding: [0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b32 v1, a2, a3 + +// GFX90A: ds_mskor_b32 v1, a2, a3 ; encoding: [0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b32 v1, a2, a3 + +// GFX90A: ds_mskor_b32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x18,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b32 v1, a2, a3 offset:4 + +// GFX90A: ds_mskor_b32 
v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x19,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b32 v1, a2, a3 offset:65535 gds + +// GFX90A: ds_write_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b32 v1, a2 offset:65535 + +// GFX90A: ds_write_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b32 v255, a2 offset:65535 + +// GFX90A: ds_write_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b32 v1, a255 offset:65535 + +// GFX90A: ds_write_b32 v1, a2 ; encoding: [0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b32 v1, a2 + +// GFX90A: ds_write_b32 v1, a2 ; encoding: [0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b32 v1, a2 + +// GFX90A: ds_write_b32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x1a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b32 v1, a2 offset:4 + +// GFX90A: ds_write_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x1b,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b32 v1, a2 offset:65535 gds + +// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_write2_b32 v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0xff,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b32 v255, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_write2_b32 v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0xff,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b32 v1, a255, a3 offset0:127 offset1:255 + +// GFX90A: ds_write2_b32 v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b32 v1, a2, a255 offset0:127 offset1:255 + +// GFX90A: ds_write2_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b32 v1, a2, a3 offset1:255 + +// GFX90A: ds_write2_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b32 v1, a2, a3 offset1:255 + +// GFX90A: ds_write2_b32 v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x1c,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b32 v1, a2, a3 offset0:16 offset1:255 + +// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b32 v1, a2, a3 offset0:127 + +// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not 
supported on this GPU +ds_write2_b32 v1, a2, a3 offset0:127 + +// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x1c,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b32 v1, a2, a3 offset0:127 offset1:1 + +// GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x1d,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 gds + +// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_write2st64_b32 v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0xff,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v255, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_write2st64_b32 v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0xff,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a255, a3 offset0:127 offset1:255 + +// GFX90A: ds_write2st64_b32 v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a2, a255 offset0:127 offset1:255 + +// GFX90A: ds_write2st64_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a2, a3 offset1:255 + +// GFX90A: ds_write2st64_b32 v1, a2, a3 offset1:255 ; encoding: 
[0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a2, a3 offset1:255 + +// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x1e,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a2, a3 offset0:16 offset1:255 + +// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a2, a3 offset0:127 + +// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a2, a3 offset0:127 + +// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x1e,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:1 + +// GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x1f,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 gds + +// GFX90A: ds_cmpst_b32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b32 v1, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_b32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0xff,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b32 v255, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_b32 v1, a255, 
a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0xff,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b32 v1, a255, a3 offset:65535 + +// GFX90A: ds_cmpst_b32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b32 v1, a2, a255 offset:65535 + +// GFX90A: ds_cmpst_b32 v1, a2, a3 ; encoding: [0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b32 v1, a2, a3 + +// GFX90A: ds_cmpst_b32 v1, a2, a3 ; encoding: [0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b32 v1, a2, a3 + +// GFX90A: ds_cmpst_b32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x20,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b32 v1, a2, a3 offset:4 + +// GFX90A: ds_cmpst_b32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x21,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b32 v1, a2, a3 offset:65535 gds + +// GFX90A: ds_cmpst_f32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f32 v1, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_f32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0xff,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f32 v255, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_f32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0xff,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and 
stores not supported on this GPU +ds_cmpst_f32 v1, a255, a3 offset:65535 + +// GFX90A: ds_cmpst_f32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0x02,0xff,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f32 v1, a2, a255 offset:65535 + +// GFX90A: ds_cmpst_f32 v1, a2, a3 ; encoding: [0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f32 v1, a2, a3 + +// GFX90A: ds_cmpst_f32 v1, a2, a3 ; encoding: [0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f32 v1, a2, a3 + +// GFX90A: ds_cmpst_f32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x22,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f32 v1, a2, a3 offset:4 + +// GFX90A: ds_cmpst_f32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x23,0xda,0x01,0x02,0x03,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f32 v1, a2, a3 offset:65535 gds + +// GFX90A: ds_min_f32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x24,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f32 v1, a2 offset:65535 + +// GFX90A: ds_min_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x24,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f32 v255, a2 offset:65535 + +// GFX90A: ds_min_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x24,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f32 v1, a255 offset:65535 + +// GFX90A: ds_min_f32 v1, a2 ; encoding: [0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f32 v1, a2 + +// GFX90A: ds_min_f32 v1, a2 ; encoding: [0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f32 v1, a2 + +// GFX90A: ds_min_f32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x24,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f32 v1, a2 offset:4 + +// GFX90A: ds_min_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x25,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f32 v1, a2 offset:65535 gds + +// GFX90A: ds_max_f32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f32 v1, a2 offset:65535 + +// GFX90A: ds_max_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f32 v255, a2 offset:65535 + +// GFX90A: ds_max_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f32 v1, a255 offset:65535 + +// GFX90A: ds_max_f32 v1, a2 ; encoding: [0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f32 v1, a2 + +// GFX90A: ds_max_f32 v1, a2 ; encoding: [0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f32 v1, a2 + +// GFX90A: ds_max_f32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x26,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr 
loads and stores not supported on this GPU +ds_max_f32 v1, a2 offset:4 + +// GFX90A: ds_max_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x27,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f32 v1, a2 offset:65535 gds + +// GFX90A: ds_add_f32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_f32 v1, a2 offset:65535 + +// GFX90A: ds_add_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_f32 v255, a2 offset:65535 + +// GFX90A: ds_add_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_f32 v1, a255 offset:65535 + +// GFX90A: ds_add_f32 v1, a2 ; encoding: [0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_f32 v1, a2 + +// GFX90A: ds_add_f32 v1, a2 ; encoding: [0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_f32 v1, a2 + +// GFX90A: ds_add_f32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x2a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_f32 v1, a2 offset:4 + +// GFX90A: ds_add_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x2b,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_f32 v1, a2 offset:65535 gds + +// GFX90A: ds_write_b8 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr 
loads and stores not supported on this GPU +ds_write_b8 v1, a2 offset:65535 + +// GFX90A: ds_write_b8 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8 v255, a2 offset:65535 + +// GFX90A: ds_write_b8 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8 v1, a255 offset:65535 + +// GFX90A: ds_write_b8 v1, a2 ; encoding: [0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8 v1, a2 + +// GFX90A: ds_write_b8 v1, a2 ; encoding: [0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8 v1, a2 + +// GFX90A: ds_write_b8 v1, a2 offset:4 ; encoding: [0x04,0x00,0x3c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8 v1, a2 offset:4 + +// GFX90A: ds_write_b8 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x3d,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8 v1, a2 offset:65535 gds + +// GFX90A: ds_write_b16 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16 v1, a2 offset:65535 + +// GFX90A: ds_write_b16 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16 v255, a2 offset:65535 + +// GFX90A: ds_write_b16 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +ds_write_b16 v1, a255 offset:65535 + +// GFX90A: ds_write_b16 v1, a2 ; encoding: [0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16 v1, a2 + +// GFX90A: ds_write_b16 v1, a2 ; encoding: [0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16 v1, a2 + +// GFX90A: ds_write_b16 v1, a2 offset:4 ; encoding: [0x04,0x00,0x3e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16 v1, a2 offset:4 + +// GFX90A: ds_write_b16 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x3f,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16 v1, a2 offset:65535 gds + +// GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u32 a5, v1, a2 offset:65535 + +// GFX90A: ds_add_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u32 a255, v1, a2 offset:65535 + +// GFX90A: ds_add_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u32 a5, v255, a2 offset:65535 + +// GFX90A: ds_add_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u32 a5, v1, a255 offset:65535 + +// GFX90A: ds_add_rtn_u32 a5, v1, a2 ; encoding: 
[0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u32 a5, v1, a2 + +// GFX90A: ds_add_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u32 a5, v1, a2 + +// GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x40,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u32 a5, v1, a2 offset:4 + +// GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x41,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u32 a5, v1, a2 offset:65535 + +// GFX90A: ds_sub_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u32 a255, v1, a2 offset:65535 + +// GFX90A: ds_sub_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u32 a5, v255, a2 offset:65535 + +// GFX90A: ds_sub_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u32 a5, v1, a255 offset:65535 + +// GFX90A: ds_sub_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and 
stores not supported on this GPU +ds_sub_rtn_u32 a5, v1, a2 + +// GFX90A: ds_sub_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u32 a5, v1, a2 + +// GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x42,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u32 a5, v1, a2 offset:4 + +// GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x43,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u32 a5, v1, a2 offset:65535 + +// GFX90A: ds_rsub_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u32 a255, v1, a2 offset:65535 + +// GFX90A: ds_rsub_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u32 a5, v255, a2 offset:65535 + +// GFX90A: ds_rsub_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u32 a5, v1, a255 offset:65535 + +// GFX90A: ds_rsub_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u32 a5, v1, a2 + +// GFX90A: ds_rsub_rtn_u32 a5, 
v1, a2 ; encoding: [0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u32 a5, v1, a2 + +// GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x44,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u32 a5, v1, a2 offset:4 + +// GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x45,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u32 a5, v1, a2 offset:65535 + +// GFX90A: ds_inc_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u32 a255, v1, a2 offset:65535 + +// GFX90A: ds_inc_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u32 a5, v255, a2 offset:65535 + +// GFX90A: ds_inc_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u32 a5, v1, a255 offset:65535 + +// GFX90A: ds_inc_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u32 a5, v1, a2 + +// GFX90A: ds_inc_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u32 a5, v1, a2 + +// GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x46,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u32 a5, v1, a2 offset:4 + +// GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x47,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u32 a5, v1, a2 offset:65535 + +// GFX90A: ds_dec_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u32 a255, v1, a2 offset:65535 + +// GFX90A: ds_dec_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u32 a5, v255, a2 offset:65535 + +// GFX90A: ds_dec_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u32 a5, v1, a255 offset:65535 + +// GFX90A: ds_dec_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u32 a5, v1, a2 + +// GFX90A: ds_dec_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u32 a5, v1, a2 + +// 
GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x48,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u32 a5, v1, a2 offset:4 + +// GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x49,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i32 a5, v1, a2 offset:65535 + +// GFX90A: ds_min_rtn_i32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i32 a255, v1, a2 offset:65535 + +// GFX90A: ds_min_rtn_i32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i32 a5, v255, a2 offset:65535 + +// GFX90A: ds_min_rtn_i32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i32 a5, v1, a255 offset:65535 + +// GFX90A: ds_min_rtn_i32 a5, v1, a2 ; encoding: [0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i32 a5, v1, a2 + +// GFX90A: ds_min_rtn_i32 a5, v1, a2 ; encoding: [0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i32 a5, v1, a2 + +// GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4a,0xda,0x01,0x02,0x00,0x05] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i32 a5, v1, a2 offset:4 + +// GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4b,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i32 a5, v1, a2 offset:65535 + +// GFX90A: ds_max_rtn_i32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i32 a255, v1, a2 offset:65535 + +// GFX90A: ds_max_rtn_i32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i32 a5, v255, a2 offset:65535 + +// GFX90A: ds_max_rtn_i32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i32 a5, v1, a255 offset:65535 + +// GFX90A: ds_max_rtn_i32 a5, v1, a2 ; encoding: [0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i32 a5, v1, a2 + +// GFX90A: ds_max_rtn_i32 a5, v1, a2 ; encoding: [0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i32 a5, v1, a2 + +// GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4c,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+ds_max_rtn_i32 a5, v1, a2 offset:4 + +// GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4d,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u32 a5, v1, a2 offset:65535 + +// GFX90A: ds_min_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u32 a255, v1, a2 offset:65535 + +// GFX90A: ds_min_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u32 a5, v255, a2 offset:65535 + +// GFX90A: ds_min_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u32 a5, v1, a255 offset:65535 + +// GFX90A: ds_min_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u32 a5, v1, a2 + +// GFX90A: ds_min_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u32 a5, v1, a2 + +// GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4e,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u32 a5, v1, a2 offset:4 + +// GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:65535 gds ; 
encoding: [0xff,0xff,0x4f,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u32 a5, v1, a2 offset:65535 + +// GFX90A: ds_max_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u32 a255, v1, a2 offset:65535 + +// GFX90A: ds_max_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u32 a5, v255, a2 offset:65535 + +// GFX90A: ds_max_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u32 a5, v1, a255 offset:65535 + +// GFX90A: ds_max_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u32 a5, v1, a2 + +// GFX90A: ds_max_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u32 a5, v1, a2 + +// GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x50,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u32 a5, v1, a2 offset:4 + +// GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x51,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr 
loads and stores not supported on this GPU +ds_max_rtn_u32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b32 a5, v1, a2 offset:65535 + +// GFX90A: ds_and_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b32 a255, v1, a2 offset:65535 + +// GFX90A: ds_and_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b32 a5, v255, a2 offset:65535 + +// GFX90A: ds_and_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b32 a5, v1, a255 offset:65535 + +// GFX90A: ds_and_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b32 a5, v1, a2 + +// GFX90A: ds_and_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b32 a5, v1, a2 + +// GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x52,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b32 a5, v1, a2 offset:4 + +// GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x53,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b32 a5, v1, a2 offset:65535 gds + +// GFX90A: 
ds_or_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b32 a5, v1, a2 offset:65535 + +// GFX90A: ds_or_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b32 a255, v1, a2 offset:65535 + +// GFX90A: ds_or_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b32 a5, v255, a2 offset:65535 + +// GFX90A: ds_or_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b32 a5, v1, a255 offset:65535 + +// GFX90A: ds_or_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b32 a5, v1, a2 + +// GFX90A: ds_or_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b32 a5, v1, a2 + +// GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x54,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b32 a5, v1, a2 offset:4 + +// GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x55,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b32 a5, v1, a2 offset:65535 + +// GFX90A: ds_xor_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b32 a255, v1, a2 offset:65535 + +// GFX90A: ds_xor_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b32 a5, v255, a2 offset:65535 + +// GFX90A: ds_xor_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b32 a5, v1, a255 offset:65535 + +// GFX90A: ds_xor_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b32 a5, v1, a2 + +// GFX90A: ds_xor_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b32 a5, v1, a2 + +// GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x56,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b32 a5, v1, a2 offset:4 + +// GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x57,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b32 a5, v1, 
a2, a5 offset:65535 + +// GFX90A: ds_mskor_rtn_b32 a255, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b32 a255, v1, a2, a5 offset:65535 + +// GFX90A: ds_mskor_rtn_b32 a5, v255, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0xff,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b32 a5, v255, a2, a5 offset:65535 + +// GFX90A: ds_mskor_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0xff,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b32 a5, v1, a255, a3 offset:65535 + +// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 + +// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b32 a5, v1, a2, a5 + +// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b32 a5, v1, a2, a5 + +// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:4 ; encoding: [0x04,0x00,0x58,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b32 a5, v1, a2, a5 offset:4 + +// GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 gds ; encoding: [0xff,0xff,0x59,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b32 a5, v1, a2, a5 
offset:65535 gds + +// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 + +// GFX90A: ds_wrxchg_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b32 a255, v1, a2 offset:65535 + +// GFX90A: ds_wrxchg_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b32 a5, v255, a2 offset:65535 + +// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b32 a5, v1, a255 offset:65535 + +// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b32 a5, v1, a2 + +// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b32 a5, v1, a2 + +// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x5a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b32 a5, v1, a2 offset:4 + +// GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x5b,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_wrxchg2_rtn_b32 
a[6:7], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0xff,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0xff,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, 
a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x5c,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x5c,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 + +// GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x5d,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v255, 
a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0xff,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0xff,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0xff,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x5e,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 + +// GFX90A: ds_wrxchg2st64_rtn_b32 
a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x5e,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 + +// GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x5f,0xda,0x01,0x02,0x03,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds + +// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_rtn_b32 a255, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0x03,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b32 a255, v1, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_rtn_b32 a5, v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0xff,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b32 a5, v255, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0xff,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b32 a5, v1, a255, a3 offset:65535 + +// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0xff,0x05] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b32 a5, v1, a2, a255 offset:65535 + +// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b32 a5, v1, a2, a3 + +// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b32 a5, v1, a2, a3 + +// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x60,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:4 + +// GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x61,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 gds + +// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_rtn_f32 a255, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0x03,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f32 a255, v1, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_rtn_f32 a5, v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0xff,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f32 a5, v255, a2, a3 offset:65535 + +// GFX90A: ds_cmpst_rtn_f32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0xff,0x03,0x05] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f32 a5, v1, a255, a3 offset:65535 + +// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0xff,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f32 a5, v1, a2, a255 offset:65535 + +// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f32 a5, v1, a2, a3 + +// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f32 a5, v1, a2, a3 + +// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x62,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:4 + +// GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x63,0xda,0x01,0x02,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 gds + +// GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f32 a5, v1, a2 offset:65535 + +// GFX90A: ds_min_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f32 a255, v1, a2 offset:65535 + +// GFX90A: ds_min_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores 
not supported on this GPU +ds_min_rtn_f32 a5, v255, a2 offset:65535 + +// GFX90A: ds_min_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f32 a5, v1, a255 offset:65535 + +// GFX90A: ds_min_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f32 a5, v1, a2 + +// GFX90A: ds_min_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f32 a5, v1, a2 + +// GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x64,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f32 a5, v1, a2 offset:4 + +// GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x65,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f32 a5, v1, a2 offset:65535 + +// GFX90A: ds_max_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f32 a255, v1, a2 offset:65535 + +// GFX90A: ds_max_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f32 a5, v255, a2 offset:65535 + +// GFX90A: ds_max_rtn_f32 a5, 
v1, a255 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f32 a5, v1, a255 offset:65535 + +// GFX90A: ds_max_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f32 a5, v1, a2 + +// GFX90A: ds_max_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f32 a5, v1, a2 + +// GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x66,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f32 a5, v1, a2 offset:4 + +// GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x67,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 + +// GFX90A: ds_wrap_rtn_b32 a255, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrap_rtn_b32 a255, v1, a2, a5 offset:65535 + +// GFX90A: ds_wrap_rtn_b32 a5, v255, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0xff,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrap_rtn_b32 a5, v255, a2, a5 offset:65535 + +// GFX90A: ds_wrap_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: 
[0xff,0xff,0x68,0xda,0x01,0xff,0x03,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrap_rtn_b32 a5, v1, a255, a3 offset:65535 + +// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 + +// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrap_rtn_b32 a5, v1, a2, a5 + +// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrap_rtn_b32 a5, v1, a2, a5 + +// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:4 ; encoding: [0x04,0x00,0x68,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrap_rtn_b32 a5, v1, a2, a5 offset:4 + +// GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 gds ; encoding: [0xff,0xff,0x69,0xda,0x01,0x02,0x05,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 gds + +// GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_f32 a5, v1, a2 offset:65535 + +// GFX90A: ds_add_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_f32 a255, v1, a2 offset:65535 + +// GFX90A: ds_add_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0xff,0x02,0x00,0x05] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_f32 a5, v255, a2 offset:65535 + +// GFX90A: ds_add_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_f32 a5, v1, a255 offset:65535 + +// GFX90A: ds_add_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_f32 a5, v1, a2 + +// GFX90A: ds_add_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_f32 a5, v1, a2 + +// GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x6a,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_f32 a5, v1, a2 offset:4 + +// GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x6b,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_f32 a5, v1, a2 offset:65535 gds + +// GFX90A: ds_read_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b32 a5, v1 offset:65535 + +// GFX90A: ds_read_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b32 a255, v1 offset:65535 + +// GFX90A: ds_read_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b32 a5, v255 offset:65535 + +// 
GFX90A: ds_read_b32 a5, v1 ; encoding: [0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b32 a5, v1 + +// GFX90A: ds_read_b32 a5, v1 ; encoding: [0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b32 a5, v1 + +// GFX90A: ds_read_b32 a5, v1 offset:4 ; encoding: [0x04,0x00,0x6c,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b32 a5, v1 offset:4 + +// GFX90A: ds_read_b32 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x6d,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b32 a5, v1 offset:65535 gds + +// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 + +// GFX90A: ds_read2_b32 a[254:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[254:255], v1 offset0:127 offset1:255 + +// GFX90A: ds_read2_b32 a[6:7], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0xff,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[6:7], v255 offset0:127 offset1:255 + +// GFX90A: ds_read2_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[6:7], v1 offset1:255 + +// GFX90A: ds_read2_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[6:7], v1 offset1:255 + +// GFX90A: ds_read2_b32 a[6:7], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x6e,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[6:7], v1 offset0:16 offset1:255 + +// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[6:7], v1 offset0:127 + +// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[6:7], v1 offset0:127 + +// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x6e,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[6:7], v1 offset0:127 offset1:1 + +// GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x6f,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 gds + +// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 + +// GFX90A: ds_read2st64_b32 a[254:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[254:255], v1 offset0:127 offset1:255 + +// GFX90A: ds_read2st64_b32 a[6:7], v255 offset0:127 offset1:255 ; 
encoding: [0x7f,0xff,0x70,0xda,0xff,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[6:7], v255 offset0:127 offset1:255 + +// GFX90A: ds_read2st64_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[6:7], v1 offset1:255 + +// GFX90A: ds_read2st64_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[6:7], v1 offset1:255 + +// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x70,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[6:7], v1 offset0:16 offset1:255 + +// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[6:7], v1 offset0:127 + +// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[6:7], v1 offset0:127 + +// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x70,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:1 + +// GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x71,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 gds + +// GFX90A: 
ds_read_i8 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8 a5, v1 offset:65535 + +// GFX90A: ds_read_i8 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8 a255, v1 offset:65535 + +// GFX90A: ds_read_i8 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8 a5, v255 offset:65535 + +// GFX90A: ds_read_i8 a5, v1 ; encoding: [0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8 a5, v1 + +// GFX90A: ds_read_i8 a5, v1 ; encoding: [0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8 a5, v1 + +// GFX90A: ds_read_i8 a5, v1 offset:4 ; encoding: [0x04,0x00,0x72,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8 a5, v1 offset:4 + +// GFX90A: ds_read_i8 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x73,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8 a5, v1 offset:65535 gds + +// GFX90A: ds_read_u8 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x74,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8 a5, v1 offset:65535 + +// GFX90A: ds_read_u8 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x74,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8 a255, v1 offset:65535 + +// GFX90A: 
ds_read_u8 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x74,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8 a5, v255 offset:65535 + +// GFX90A: ds_read_u8 a5, v1 ; encoding: [0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8 a5, v1 + +// GFX90A: ds_read_u8 a5, v1 ; encoding: [0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8 a5, v1 + +// GFX90A: ds_read_u8 a5, v1 offset:4 ; encoding: [0x04,0x00,0x74,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8 a5, v1 offset:4 + +// GFX90A: ds_read_u8 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x75,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8 a5, v1 offset:65535 gds + +// GFX90A: ds_read_i16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i16 a5, v1 offset:65535 + +// GFX90A: ds_read_i16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i16 a255, v1 offset:65535 + +// GFX90A: ds_read_i16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i16 a5, v255 offset:65535 + +// GFX90A: ds_read_i16 a5, v1 ; encoding: [0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i16 a5, v1 + +// GFX90A: ds_read_i16 a5, v1 ; 
encoding: [0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i16 a5, v1 + +// GFX90A: ds_read_i16 a5, v1 offset:4 ; encoding: [0x04,0x00,0x76,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i16 a5, v1 offset:4 + +// GFX90A: ds_read_i16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x77,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i16 a5, v1 offset:65535 gds + +// GFX90A: ds_read_u16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16 a5, v1 offset:65535 + +// GFX90A: ds_read_u16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16 a255, v1 offset:65535 + +// GFX90A: ds_read_u16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16 a5, v255 offset:65535 + +// GFX90A: ds_read_u16 a5, v1 ; encoding: [0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16 a5, v1 + +// GFX90A: ds_read_u16 a5, v1 ; encoding: [0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16 a5, v1 + +// GFX90A: ds_read_u16 a5, v1 offset:4 ; encoding: [0x04,0x00,0x78,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16 a5, v1 offset:4 + +// GFX90A: ds_read_u16 a5, v1 offset:65535 gds ; 
encoding: [0xff,0xff,0x79,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16 a5, v1 offset:65535 gds + +// GFX90A: ds_swizzle_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_swizzle_b32 a5, v1 offset:65535 + +// GFX90A: ds_swizzle_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_swizzle_b32 a255, v1 offset:65535 + +// GFX90A: ds_swizzle_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_swizzle_b32 a5, v255 offset:65535 + +// GFX90A: ds_swizzle_b32 a5, v1 ; encoding: [0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_swizzle_b32 a5, v1 + +// GFX90A: ds_swizzle_b32 a5, v1 ; encoding: [0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_swizzle_b32 a5, v1 + +// GFX90A: ds_swizzle_b32 a5, v1 offset:swizzle(BITMASK_PERM,"00p00") ; encoding: [0x04,0x00,0x7a,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_swizzle_b32 a5, v1 offset:swizzle(BITMASK_PERM,"00p00") + +// GFX90A: ds_swizzle_b32 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_swizzle_b32 a5, v1 offset:65535 gds + +// GFX90A: ds_permute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads 
and stores not supported on this GPU +ds_permute_b32 a5, v1, a2 offset:65535 + +// GFX90A: ds_permute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_permute_b32 a255, v1, a2 offset:65535 + +// GFX90A: ds_permute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_permute_b32 a5, v255, a2 offset:65535 + +// GFX90A: ds_permute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_permute_b32 a5, v1, a255 offset:65535 + +// GFX90A: ds_permute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_permute_b32 a5, v1, a2 + +// GFX90A: ds_permute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_permute_b32 a5, v1, a2 + +// GFX90A: ds_permute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7c,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_permute_b32 a5, v1, a2 offset:4 + +// GFX90A: ds_bpermute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_bpermute_b32 a5, v1, a2 offset:65535 + +// GFX90A: ds_bpermute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_bpermute_b32 a255, v1, a2 offset:65535 + +// GFX90A: 
ds_bpermute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0xff,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_bpermute_b32 a5, v255, a2 offset:65535 + +// GFX90A: ds_bpermute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0xff,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_bpermute_b32 a5, v1, a255 offset:65535 + +// GFX90A: ds_bpermute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_bpermute_b32 a5, v1, a2 + +// GFX90A: ds_bpermute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_bpermute_b32 a5, v1, a2 + +// GFX90A: ds_bpermute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7e,0xda,0x01,0x02,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_bpermute_b32 a5, v1, a2 offset:4 + +// GFX90A: ds_add_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u64 v1, a[2:3] offset:65535 + +// GFX90A: ds_add_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u64 v255, a[2:3] offset:65535 + +// GFX90A: ds_add_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u64 v1, a[254:255] offset:65535 + +// GFX90A: ds_add_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +ds_add_u64 v1, a[2:3] + +// GFX90A: ds_add_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u64 v1, a[2:3] + +// GFX90A: ds_add_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x80,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u64 v1, a[2:3] offset:4 + +// GFX90A: ds_add_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x81,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_u64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_sub_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u64 v1, a[2:3] offset:65535 + +// GFX90A: ds_sub_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u64 v255, a[2:3] offset:65535 + +// GFX90A: ds_sub_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u64 v1, a[254:255] offset:65535 + +// GFX90A: ds_sub_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u64 v1, a[2:3] + +// GFX90A: ds_sub_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u64 v1, a[2:3] + +// GFX90A: ds_sub_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x82,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u64 v1, a[2:3] offset:4 + +// GFX90A: ds_sub_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x83,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_u64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_rsub_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u64 v1, a[2:3] offset:65535 + +// GFX90A: ds_rsub_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u64 v255, a[2:3] offset:65535 + +// GFX90A: ds_rsub_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u64 v1, a[254:255] offset:65535 + +// GFX90A: ds_rsub_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u64 v1, a[2:3] + +// GFX90A: ds_rsub_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u64 v1, a[2:3] + +// GFX90A: ds_rsub_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x84,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u64 v1, a[2:3] offset:4 + +// GFX90A: ds_rsub_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x85,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_u64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_inc_u64 v1, 
a[2:3] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u64 v1, a[2:3] offset:65535 + +// GFX90A: ds_inc_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u64 v255, a[2:3] offset:65535 + +// GFX90A: ds_inc_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u64 v1, a[254:255] offset:65535 + +// GFX90A: ds_inc_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u64 v1, a[2:3] + +// GFX90A: ds_inc_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u64 v1, a[2:3] + +// GFX90A: ds_inc_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x86,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u64 v1, a[2:3] offset:4 + +// GFX90A: ds_inc_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x87,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_u64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_dec_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u64 v1, a[2:3] offset:65535 + +// GFX90A: ds_dec_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this 
GPU +ds_dec_u64 v255, a[2:3] offset:65535 + +// GFX90A: ds_dec_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u64 v1, a[254:255] offset:65535 + +// GFX90A: ds_dec_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u64 v1, a[2:3] + +// GFX90A: ds_dec_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u64 v1, a[2:3] + +// GFX90A: ds_dec_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x88,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u64 v1, a[2:3] offset:4 + +// GFX90A: ds_dec_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x89,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_u64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_min_i64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i64 v1, a[2:3] offset:65535 + +// GFX90A: ds_min_i64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i64 v255, a[2:3] offset:65535 + +// GFX90A: ds_min_i64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i64 v1, a[254:255] offset:65535 + +// GFX90A: ds_min_i64 v1, a[2:3] ; encoding: [0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +ds_min_i64 v1, a[2:3] + +// GFX90A: ds_min_i64 v1, a[2:3] ; encoding: [0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i64 v1, a[2:3] + +// GFX90A: ds_min_i64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x8a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i64 v1, a[2:3] offset:4 + +// GFX90A: ds_min_i64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8b,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_i64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_max_i64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i64 v1, a[2:3] offset:65535 + +// GFX90A: ds_max_i64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i64 v255, a[2:3] offset:65535 + +// GFX90A: ds_max_i64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i64 v1, a[254:255] offset:65535 + +// GFX90A: ds_max_i64 v1, a[2:3] ; encoding: [0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i64 v1, a[2:3] + +// GFX90A: ds_max_i64 v1, a[2:3] ; encoding: [0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i64 v1, a[2:3] + +// GFX90A: ds_max_i64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x8c,0xda,0x01,0x02,0x00,0x00] 
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i64 v1, a[2:3] offset:4 + +// GFX90A: ds_max_i64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8d,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_i64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_min_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u64 v1, a[2:3] offset:65535 + +// GFX90A: ds_min_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u64 v255, a[2:3] offset:65535 + +// GFX90A: ds_min_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u64 v1, a[254:255] offset:65535 + +// GFX90A: ds_min_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u64 v1, a[2:3] + +// GFX90A: ds_min_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u64 v1, a[2:3] + +// GFX90A: ds_min_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x8e,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u64 v1, a[2:3] offset:4 + +// GFX90A: ds_min_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8f,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_u64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_max_u64 v1, 
a[2:3] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u64 v1, a[2:3] offset:65535 + +// GFX90A: ds_max_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u64 v255, a[2:3] offset:65535 + +// GFX90A: ds_max_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u64 v1, a[254:255] offset:65535 + +// GFX90A: ds_max_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u64 v1, a[2:3] + +// GFX90A: ds_max_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u64 v1, a[2:3] + +// GFX90A: ds_max_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x90,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u64 v1, a[2:3] offset:4 + +// GFX90A: ds_max_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x91,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_u64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_and_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b64 v1, a[2:3] offset:65535 + +// GFX90A: ds_and_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this 
GPU +ds_and_b64 v255, a[2:3] offset:65535 + +// GFX90A: ds_and_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b64 v1, a[254:255] offset:65535 + +// GFX90A: ds_and_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b64 v1, a[2:3] + +// GFX90A: ds_and_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b64 v1, a[2:3] + +// GFX90A: ds_and_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x92,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b64 v1, a[2:3] offset:4 + +// GFX90A: ds_and_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x93,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_b64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_or_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b64 v1, a[2:3] offset:65535 + +// GFX90A: ds_or_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b64 v255, a[2:3] offset:65535 + +// GFX90A: ds_or_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b64 v1, a[254:255] offset:65535 + +// GFX90A: ds_or_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +ds_or_b64 v1, a[2:3] + +// GFX90A: ds_or_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b64 v1, a[2:3] + +// GFX90A: ds_or_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x94,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b64 v1, a[2:3] offset:4 + +// GFX90A: ds_or_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x95,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_b64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_xor_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b64 v1, a[2:3] offset:65535 + +// GFX90A: ds_xor_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b64 v255, a[2:3] offset:65535 + +// GFX90A: ds_xor_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b64 v1, a[254:255] offset:65535 + +// GFX90A: ds_xor_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b64 v1, a[2:3] + +// GFX90A: ds_xor_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b64 v1, a[2:3] + +// GFX90A: ds_xor_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x96,0xda,0x01,0x02,0x00,0x00] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b64 v1, a[2:3] offset:4 + +// GFX90A: ds_xor_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x97,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_b64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_mskor_b64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0xff,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b64 v255, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_mskor_b64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0xfe,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b64 v1, a[254:255], a[4:5] offset:65535 + +// GFX90A: ds_mskor_b64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0x02,0xfe,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b64 v1, a[2:3], a[254:255] offset:65535 + +// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b64 v1, a[2:3], a[4:5] + +// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b64 v1, a[2:3], a[4:5] + +// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0x98,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid 
register class: agpr loads and stores not supported on this GPU +ds_mskor_b64 v1, a[2:3], a[4:5] offset:4 + +// GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0x99,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 gds + +// GFX90A: ds_write_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b64 v1, a[2:3] offset:65535 + +// GFX90A: ds_write_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b64 v255, a[2:3] offset:65535 + +// GFX90A: ds_write_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b64 v1, a[254:255] offset:65535 + +// GFX90A: ds_write_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b64 v1, a[2:3] + +// GFX90A: ds_write_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b64 v1, a[2:3] + +// GFX90A: ds_write_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x9a,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b64 v1, a[2:3] offset:4 + +// GFX90A: ds_write_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x9b,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b64 v1, a[2:3] offset:65535 gds + +// 
GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_write2_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0xff,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_write2_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0xfe,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_write2_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0x02,0xfe,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 + +// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 + +// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 + +// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0x9c,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 + +// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: 
[0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 + +// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 + +// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x9c,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 + +// GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x9d,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds + +// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_write2st64_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0xff,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_write2st64_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0xfe,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_write2st64_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0x02,0xfe,0x00] 
+// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 + +// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 + +// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 + +// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0x9e,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 + +// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 + +// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 + +// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x9e,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 + +// GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x9f,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this 
GPU +ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds + +// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_b64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xff,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b64 v255, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_b64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0xfe,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b64 v1, a[254:255], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_b64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0xfe,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b64 v1, a[2:3], a[254:255] offset:65535 + +// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b64 v1, a[2:3], a[4:5] + +// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b64 v1, a[2:3], a[4:5] + +// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xa0,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_b64 v1, a[2:3], a[4:5] offset:4 + +// GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xa1,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not 
supported on this GPU +ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 gds + +// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_f64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0xff,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f64 v255, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_f64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0xfe,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f64 v1, a[254:255], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_f64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0x02,0xfe,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f64 v1, a[2:3], a[254:255] offset:65535 + +// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f64 v1, a[2:3], a[4:5] + +// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f64 v1, a[2:3], a[4:5] + +// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xa2,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_f64 v1, a[2:3], a[4:5] offset:4 + +// GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xa3,0xda,0x01,0x02,0x04,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not 
supported on this GPU +ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 gds + +// GFX90A: ds_min_f64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f64 v1, a[2:3] offset:65535 + +// GFX90A: ds_min_f64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f64 v255, a[2:3] offset:65535 + +// GFX90A: ds_min_f64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f64 v1, a[254:255] offset:65535 + +// GFX90A: ds_min_f64 v1, a[2:3] ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f64 v1, a[2:3] + +// GFX90A: ds_min_f64 v1, a[2:3] ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f64 v1, a[2:3] + +// GFX90A: ds_min_f64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xa4,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f64 v1, a[2:3] offset:4 + +// GFX90A: ds_min_f64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xa5,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_f64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_max_f64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f64 v1, a[2:3] offset:65535 + +// GFX90A: ds_max_f64 v255, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0xa6,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f64 v255, a[2:3] offset:65535 + +// GFX90A: ds_max_f64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0x01,0xfe,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f64 v1, a[254:255] offset:65535 + +// GFX90A: ds_max_f64 v1, a[2:3] ; encoding: [0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f64 v1, a[2:3] + +// GFX90A: ds_max_f64 v1, a[2:3] ; encoding: [0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f64 v1, a[2:3] + +// GFX90A: ds_max_f64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xa6,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f64 v1, a[2:3] offset:4 + +// GFX90A: ds_max_f64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xa7,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_f64 v1, a[2:3] offset:65535 gds + +// GFX90A: ds_write_b8_d16_hi v1, a2 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8_d16_hi v1, a2 offset:65535 + +// GFX90A: ds_write_b8_d16_hi v255, a2 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8_d16_hi v255, a2 offset:65535 + +// GFX90A: ds_write_b8_d16_hi v1, a255 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+ds_write_b8_d16_hi v1, a255 offset:65535 + +// GFX90A: ds_write_b8_d16_hi v1, a2 ; encoding: [0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8_d16_hi v1, a2 + +// GFX90A: ds_write_b8_d16_hi v1, a2 ; encoding: [0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8_d16_hi v1, a2 + +// GFX90A: ds_write_b8_d16_hi v1, a2 offset:4 ; encoding: [0x04,0x00,0xa8,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8_d16_hi v1, a2 offset:4 + +// GFX90A: ds_write_b8_d16_hi v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0xa9,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b8_d16_hi v1, a2 offset:65535 gds + +// GFX90A: ds_write_b16_d16_hi v1, a2 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16_d16_hi v1, a2 offset:65535 + +// GFX90A: ds_write_b16_d16_hi v255, a2 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16_d16_hi v255, a2 offset:65535 + +// GFX90A: ds_write_b16_d16_hi v1, a255 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0x01,0xff,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16_d16_hi v1, a255 offset:65535 + +// GFX90A: ds_write_b16_d16_hi v1, a2 ; encoding: [0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16_d16_hi v1, a2 + +// GFX90A: ds_write_b16_d16_hi v1, a2 ; encoding: 
[0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16_d16_hi v1, a2 + +// GFX90A: ds_write_b16_d16_hi v1, a2 offset:4 ; encoding: [0x04,0x00,0xaa,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16_d16_hi v1, a2 offset:4 + +// GFX90A: ds_write_b16_d16_hi v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0xab,0xda,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b16_d16_hi v1, a2 offset:65535 gds + +// GFX90A: ds_read_u8_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16 a5, v1 offset:65535 + +// GFX90A: ds_read_u8_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16 a255, v1 offset:65535 + +// GFX90A: ds_read_u8_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16 a5, v255 offset:65535 + +// GFX90A: ds_read_u8_d16 a5, v1 ; encoding: [0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16 a5, v1 + +// GFX90A: ds_read_u8_d16 a5, v1 ; encoding: [0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16 a5, v1 + +// GFX90A: ds_read_u8_d16 a5, v1 offset:4 ; encoding: [0x04,0x00,0xac,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16 a5, v1 
offset:4 + +// GFX90A: ds_read_u8_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xad,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16 a5, v1 offset:65535 gds + +// GFX90A: ds_read_u8_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16_hi a5, v1 offset:65535 + +// GFX90A: ds_read_u8_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16_hi a255, v1 offset:65535 + +// GFX90A: ds_read_u8_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16_hi a5, v255 offset:65535 + +// GFX90A: ds_read_u8_d16_hi a5, v1 ; encoding: [0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16_hi a5, v1 + +// GFX90A: ds_read_u8_d16_hi a5, v1 ; encoding: [0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16_hi a5, v1 + +// GFX90A: ds_read_u8_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xae,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16_hi a5, v1 offset:4 + +// GFX90A: ds_read_u8_d16_hi a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xaf,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u8_d16_hi a5, v1 offset:65535 gds + +// GFX90A: ds_read_i8_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0x05] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16 a5, v1 offset:65535 + +// GFX90A: ds_read_i8_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16 a255, v1 offset:65535 + +// GFX90A: ds_read_i8_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16 a5, v255 offset:65535 + +// GFX90A: ds_read_i8_d16 a5, v1 ; encoding: [0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16 a5, v1 + +// GFX90A: ds_read_i8_d16 a5, v1 ; encoding: [0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16 a5, v1 + +// GFX90A: ds_read_i8_d16 a5, v1 offset:4 ; encoding: [0x04,0x00,0xb0,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16 a5, v1 offset:4 + +// GFX90A: ds_read_i8_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb1,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16 a5, v1 offset:65535 gds + +// GFX90A: ds_read_i8_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16_hi a5, v1 offset:65535 + +// GFX90A: ds_read_i8_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16_hi a255, v1 offset:65535 + +// GFX90A: 
ds_read_i8_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16_hi a5, v255 offset:65535 + +// GFX90A: ds_read_i8_d16_hi a5, v1 ; encoding: [0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16_hi a5, v1 + +// GFX90A: ds_read_i8_d16_hi a5, v1 ; encoding: [0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16_hi a5, v1 + +// GFX90A: ds_read_i8_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xb2,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16_hi a5, v1 offset:4 + +// GFX90A: ds_read_i8_d16_hi a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb3,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_i8_d16_hi a5, v1 offset:65535 gds + +// GFX90A: ds_read_u16_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16 a5, v1 offset:65535 + +// GFX90A: ds_read_u16_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16 a255, v1 offset:65535 + +// GFX90A: ds_read_u16_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16 a5, v255 offset:65535 + +// GFX90A: ds_read_u16_d16 a5, v1 ; encoding: [0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr 
loads and stores not supported on this GPU +ds_read_u16_d16 a5, v1 + +// GFX90A: ds_read_u16_d16 a5, v1 ; encoding: [0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16 a5, v1 + +// GFX90A: ds_read_u16_d16 a5, v1 offset:4 ; encoding: [0x04,0x00,0xb4,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16 a5, v1 offset:4 + +// GFX90A: ds_read_u16_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb5,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16 a5, v1 offset:65535 gds + +// GFX90A: ds_read_u16_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16_hi a5, v1 offset:65535 + +// GFX90A: ds_read_u16_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16_hi a255, v1 offset:65535 + +// GFX90A: ds_read_u16_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16_hi a5, v255 offset:65535 + +// GFX90A: ds_read_u16_d16_hi a5, v1 ; encoding: [0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16_hi a5, v1 + +// GFX90A: ds_read_u16_d16_hi a5, v1 ; encoding: [0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16_hi a5, v1 + +// GFX90A: ds_read_u16_d16_hi a5, v1 offset:4 ; encoding: 
[0x04,0x00,0xb6,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16_hi a5, v1 offset:4 + +// GFX90A: ds_read_u16_d16_hi a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb7,0xda,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_u16_d16_hi a5, v1 offset:65535 gds + +// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_add_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_add_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; 
encoding: [0x04,0x00,0xc0,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc1,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_sub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_sub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: 
ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc2,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc3,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_rsub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_rsub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+ds_rsub_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc5,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_inc_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_inc_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads 
and stores not supported on this GPU +ds_inc_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc7,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_dec_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_dec_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: 
invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc8,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc9,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_min_rtn_i64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_min_rtn_i64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i64 a[6:7], v1, a[2:3] + +// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] ; encoding: 
[0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i64 a[6:7], v1, a[2:3] + +// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xca,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcb,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_max_rtn_i64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_max_rtn_i64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i64 a[6:7], v1, a[2:3] + +// GFX90A: 
ds_max_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i64 a[6:7], v1, a[2:3] + +// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xcc,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcd,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_min_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_min_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u64 
a[6:7], v1, a[2:3] + +// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xce,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcf,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_max_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_max_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not 
supported on this GPU +ds_max_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u64 a[6:7], v1, a[2:3] + +// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd0,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd1,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_and_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_and_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register 
class: agpr loads and stores not supported on this GPU +ds_and_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd2,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd3,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_or_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_or_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd5,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_xor_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_xor_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] ; encoding: 
[0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd7,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_mskor_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_mskor_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0xff,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0xfe,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU 
+ds_mskor_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 + +// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0xfe,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 + +// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] + +// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] + +// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xd8,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 + +// GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xd9,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds + +// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_wrxchg_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v255, a[2:3] 
offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xda,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xdb,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and 
stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0xff,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0xfe,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0xfe,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0xdc,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: 
[0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xdc,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 + +// GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xdd,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0xff,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v255, 
a[2:3], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0xfe,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0xfe,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0xde,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: 
error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xde,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 + +// GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xdf,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds + +// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0xff,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0xfe,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: 
[0xff,0xff,0xe0,0xda,0x01,0x02,0xfe,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 + +// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] + +// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] + +// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xe0,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 + +// GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xe1,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds + +// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_rtn_f64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f64 a[254:255], v1, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_rtn_f64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0xff,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: 
agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f64 a[6:7], v255, a[2:3], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0xfe,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f64 a[6:7], v1, a[254:255], a[4:5] offset:65535 + +// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0xfe,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[254:255] offset:65535 + +// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] + +// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] + +// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xe2,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:4 + +// GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xe3,0xda,0x01,0x02,0x04,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds + +// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 + 
+// GFX90A: ds_min_rtn_f64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_min_rtn_f64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f64 a[6:7], v1, a[2:3] + +// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f64 a[6:7], v1, a[2:3] + +// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xe4,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xe5,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f64 
a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_max_rtn_f64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_max_rtn_f64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f64 a[6:7], v1, a[2:3] + +// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f64 a[6:7], v1, a[2:3] + +// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xe6,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xe7,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_read_b64 a[6:7], v1 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on 
this GPU +ds_read_b64 a[6:7], v1 offset:65535 + +// GFX90A: ds_read_b64 a[254:255], v1 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0x01,0x00,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b64 a[254:255], v1 offset:65535 + +// GFX90A: ds_read_b64 a[6:7], v255 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0xff,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b64 a[6:7], v255 offset:65535 + +// GFX90A: ds_read_b64 a[6:7], v1 ; encoding: [0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b64 a[6:7], v1 + +// GFX90A: ds_read_b64 a[6:7], v1 ; encoding: [0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b64 a[6:7], v1 + +// GFX90A: ds_read_b64 a[6:7], v1 offset:4 ; encoding: [0x04,0x00,0xec,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b64 a[6:7], v1 offset:4 + +// GFX90A: ds_read_b64 a[6:7], v1 offset:65535 gds ; encoding: [0xff,0xff,0xed,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b64 a[6:7], v1 offset:65535 gds + +// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 + +// GFX90A: ds_read2_b64 a[252:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[252:255], v1 offset0:127 offset1:255 + +// GFX90A: ds_read2_b64 a[6:9], v255 
offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0xff,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[6:9], v255 offset0:127 offset1:255 + +// GFX90A: ds_read2_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[6:9], v1 offset1:255 + +// GFX90A: ds_read2_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[6:9], v1 offset1:255 + +// GFX90A: ds_read2_b64 a[6:9], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0xee,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[6:9], v1 offset0:16 offset1:255 + +// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[6:9], v1 offset0:127 + +// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[6:9], v1 offset0:127 + +// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xee,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[6:9], v1 offset0:127 offset1:1 + +// GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xef,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 gds + +// GFX90A: ds_read2st64_b64 a[6:9], v1 
offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 + +// GFX90A: ds_read2st64_b64 a[252:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 a[252:255], v1 offset0:127 offset1:255 + +// GFX90A: ds_read2st64_b64 a[6:9], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0xff,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 a[6:9], v255 offset0:127 offset1:255 + +// GFX90A: ds_read2st64_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 a[6:9], v1 offset1:255 + +// GFX90A: ds_read2st64_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 a[6:9], v1 offset1:255 + +// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0xf0,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 a[6:9], v1 offset0:16 offset1:255 + +// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 a[6:9], v1 offset0:127 + +// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 
a[6:9], v1 offset0:127 + +// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xf0,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:1 + +// GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xf1,0xda,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 gds + +// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 + +// GFX90A: ds_condxchg32_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0xfe] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_condxchg32_rtn_b64 a[254:255], v1, a[2:3] offset:65535 + +// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0xff,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_condxchg32_rtn_b64 a[6:7], v255, a[2:3] offset:65535 + +// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0xfe,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_condxchg32_rtn_b64 a[6:7], v1, a[254:255] offset:65535 + +// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] ; encoding: 
[0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] + +// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xfc,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:4 + +// GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xfd,0xda,0x01,0x02,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds + +// GFX90A: ds_gws_init a1 offset:65535 gds ; encoding: [0xff,0xff,0x33,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_init a1 offset:65535 gds + +// GFX90A: ds_gws_init a255 offset:65535 gds ; encoding: [0xff,0xff,0x33,0xdb,0xff,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_init a255 offset:65535 gds + +// GFX90A: ds_gws_init a1 gds ; encoding: [0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_init a1 gds + +// GFX90A: ds_gws_init a1 gds ; encoding: [0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_init a1 gds + +// GFX90A: ds_gws_init a1 offset:4 gds ; encoding: [0x04,0x00,0x33,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_init a1 offset:4 gds + +// GFX90A: ds_gws_sema_br a1 offset:65535 gds ; encoding: [0xff,0xff,0x37,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on 
this GPU +ds_gws_sema_br a1 offset:65535 gds + +// GFX90A: ds_gws_sema_br a255 offset:65535 gds ; encoding: [0xff,0xff,0x37,0xdb,0xff,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_sema_br a255 offset:65535 gds + +// GFX90A: ds_gws_sema_br a1 gds ; encoding: [0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_sema_br a1 gds + +// GFX90A: ds_gws_sema_br a1 gds ; encoding: [0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_sema_br a1 gds + +// GFX90A: ds_gws_sema_br a1 offset:4 gds ; encoding: [0x04,0x00,0x37,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_sema_br a1 offset:4 gds + +// GFX90A: ds_gws_barrier a1 offset:65535 gds ; encoding: [0xff,0xff,0x3b,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_barrier a1 offset:65535 gds + +// GFX90A: ds_gws_barrier a255 offset:65535 gds ; encoding: [0xff,0xff,0x3b,0xdb,0xff,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_barrier a255 offset:65535 gds + +// GFX90A: ds_gws_barrier a1 gds ; encoding: [0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_barrier a1 gds + +// GFX90A: ds_gws_barrier a1 gds ; encoding: [0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_gws_barrier a1 gds + +// GFX90A: ds_gws_barrier a1 offset:4 gds ; encoding: [0x04,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on 
this GPU +ds_gws_barrier a1 offset:4 gds + +// GFX90A: ds_consume a5 offset:65535 ; encoding: [0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_consume a5 offset:65535 + +// GFX90A: ds_consume a255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_consume a255 offset:65535 + +// GFX90A: ds_consume a5 ; encoding: [0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_consume a5 + +// GFX90A: ds_consume a5 ; encoding: [0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_consume a5 + +// GFX90A: ds_consume a5 offset:4 ; encoding: [0x04,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_consume a5 offset:4 + +// GFX90A: ds_consume a5 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_consume a5 offset:65535 gds + +// GFX90A: ds_append a5 offset:65535 ; encoding: [0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_append a5 offset:65535 + +// GFX90A: ds_append a255 offset:65535 ; encoding: [0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_append a255 offset:65535 + +// GFX90A: ds_append a5 ; encoding: [0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_append a5 + +// GFX90A: ds_append a5 ; encoding: 
[0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_append a5 + +// GFX90A: ds_append a5 offset:4 ; encoding: [0x04,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_append a5 offset:4 + +// GFX90A: ds_append a5 offset:65535 gds ; encoding: [0xff,0xff,0x7d,0xdb,0x00,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_append a5 offset:65535 gds + +// GFX90A: ds_ordered_count a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_ordered_count a5, v1 offset:65535 gds + +// GFX90A: ds_ordered_count a255, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0xff] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_ordered_count a255, v1 offset:65535 gds + +// GFX90A: ds_ordered_count a5, v255 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0xff,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_ordered_count a5, v255 offset:65535 gds + +// GFX90A: ds_ordered_count a5, v1 gds ; encoding: [0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_ordered_count a5, v1 gds + +// GFX90A: ds_ordered_count a5, v1 gds ; encoding: [0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_ordered_count a5, v1 gds + +// GFX90A: ds_ordered_count a5, v1 offset:4 gds ; encoding: [0x04,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_ordered_count a5, v1 
offset:4 gds + +// GFX90A: ds_write_b96 v1, a[2:4] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b96 v1, a[2:4] offset:65535 + +// GFX90A: ds_write_b96 v255, a[2:4] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0xff,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b96 v255, a[2:4] offset:65535 + +// GFX90A: ds_write_b96 v1, a[252:254] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0x01,0xfc,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b96 v1, a[252:254] offset:65535 + +// GFX90A: ds_write_b96 v1, a[2:4] ; encoding: [0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b96 v1, a[2:4] + +// GFX90A: ds_write_b96 v1, a[2:4] ; encoding: [0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b96 v1, a[2:4] + +// GFX90A: ds_write_b96 v1, a[2:4] offset:4 ; encoding: [0x04,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b96 v1, a[2:4] offset:4 + +// GFX90A: ds_write_b96 v1, a[2:4] offset:65535 gds ; encoding: [0xff,0xff,0xbd,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b96 v1, a[2:4] offset:65535 gds + +// GFX90A: ds_write_b128 v1, a[2:5] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b128 v1, a[2:5] offset:65535 + +// GFX90A: ds_write_b128 v255, a[2:5] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0xff,0x02,0x00,0x00] +// 
NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b128 v255, a[2:5] offset:65535 + +// GFX90A: ds_write_b128 v1, a[252:255] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0x01,0xfc,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b128 v1, a[252:255] offset:65535 + +// GFX90A: ds_write_b128 v1, a[2:5] ; encoding: [0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b128 v1, a[2:5] + +// GFX90A: ds_write_b128 v1, a[2:5] ; encoding: [0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b128 v1, a[2:5] + +// GFX90A: ds_write_b128 v1, a[2:5] offset:4 ; encoding: [0x04,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b128 v1, a[2:5] offset:4 + +// GFX90A: ds_write_b128 v1, a[2:5] offset:65535 gds ; encoding: [0xff,0xff,0xbf,0xdb,0x01,0x02,0x00,0x00] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_write_b128 v1, a[2:5] offset:65535 gds + +// GFX90A: ds_read_b96 a[6:8], v1 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b96 a[6:8], v1 offset:65535 + +// GFX90A: ds_read_b96 a[252:254], v1 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b96 a[252:254], v1 offset:65535 + +// GFX90A: ds_read_b96 a[6:8], v255 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0xff,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b96 a[6:8], v255 
offset:65535 + +// GFX90A: ds_read_b96 a[6:8], v1 ; encoding: [0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b96 a[6:8], v1 + +// GFX90A: ds_read_b96 a[6:8], v1 ; encoding: [0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b96 a[6:8], v1 + +// GFX90A: ds_read_b96 a[6:8], v1 offset:4 ; encoding: [0x04,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b96 a[6:8], v1 offset:4 + +// GFX90A: ds_read_b96 a[6:8], v1 offset:65535 gds ; encoding: [0xff,0xff,0xfd,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b96 a[6:8], v1 offset:65535 gds + +// GFX90A: ds_read_b128 a[6:9], v1 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b128 a[6:9], v1 offset:65535 + +// GFX90A: ds_read_b128 a[252:255], v1 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0xfc] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b128 a[252:255], v1 offset:65535 + +// GFX90A: ds_read_b128 a[6:9], v255 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0xff,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b128 a[6:9], v255 offset:65535 + +// GFX90A: ds_read_b128 a[6:9], v1 ; encoding: [0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b128 a[6:9], v1 + +// GFX90A: ds_read_b128 a[6:9], v1 ; encoding: [0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and 
stores not supported on this GPU +ds_read_b128 a[6:9], v1 + +// GFX90A: ds_read_b128 a[6:9], v1 offset:4 ; encoding: [0x04,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b128 a[6:9], v1 offset:4 + +// GFX90A: ds_read_b128 a[6:9], v1 offset:65535 gds ; encoding: [0xff,0xff,0xff,0xdb,0x01,0x00,0x00,0x06] +// NOT-GFX90A: error: invalid register class: agpr loads and stores not supported on this GPU +ds_read_b128 a[6:9], v1 offset:65535 gds + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x1 + +// GFX90A: image_load a252, v[2:5], s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a252, v[2:5], s[8:15] dmask:0x1 + +// GFX90A: image_load a5, v[252:255], s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[252:255], s[8:15] dmask:0x1 + +// GFX90A: image_load a5, v[2:5], s[12:19] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[12:19] dmask:0x1 + +// GFX90A: image_load a5, v[2:5], s[92:99] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[92:99] dmask:0x1 + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x2 ; encoding: [0x00,0x02,0x01,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x2 + +// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0x3 ; encoding: [0x00,0x03,0x01,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: 
operands are not valid for this GPU or mode +image_load a[6:7], v[2:5], s[8:15] dmask:0x3 + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x4 ; encoding: [0x00,0x04,0x01,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x4 + +// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x01,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a[6:7], v[2:5], s[8:15] dmask:0x5 + +// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0x6 ; encoding: [0x00,0x06,0x01,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a[6:7], v[2:5], s[8:15] dmask:0x6 + +// GFX90A: image_load a[6:8], v[2:5], s[8:15] dmask:0x7 ; encoding: [0x00,0x07,0x01,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a[6:8], v[2:5], s[8:15] dmask:0x7 + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x8 ; encoding: [0x00,0x08,0x01,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x8 + +// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0x9 ; encoding: [0x00,0x09,0x01,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a[6:7], v[2:5], s[8:15] dmask:0x9 + +// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0xa ; encoding: [0x00,0x0a,0x01,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a[6:7], v[2:5], s[8:15] dmask:0xa + +// GFX90A: image_load a[6:8], v[2:5], s[8:15] dmask:0xb ; encoding: [0x00,0x0b,0x01,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a[6:8], v[2:5], s[8:15] dmask:0xb + +// GFX90A: image_load a[6:7], v[2:5], s[8:15] dmask:0xc ; encoding: [0x00,0x0c,0x01,0xf0,0x02,0x06,0x02,0x00] +// 
NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a[6:7], v[2:5], s[8:15] dmask:0xc + +// GFX90A: image_load a[6:8], v[2:5], s[8:15] dmask:0xd ; encoding: [0x00,0x0d,0x01,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a[6:8], v[2:5], s[8:15] dmask:0xd + +// GFX90A: image_load a[6:8], v[2:5], s[8:15] dmask:0xe ; encoding: [0x00,0x0e,0x01,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a[6:8], v[2:5], s[8:15] dmask:0xe + +// GFX90A: image_load a5, v[2:5], s[8:15] ; encoding: [0x00,0x00,0x01,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x01,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 glc ; encoding: [0x00,0x21,0x01,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x1 glc + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 slc ; encoding: [0x00,0x01,0x01,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x1 slc + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 lwe ; encoding: [0x00,0x01,0x03,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x1 lwe + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 da ; encoding: [0x00,0x41,0x01,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x1 da + +// GFX90A: image_load a5, v[2:5], s[8:15] dmask:0x1 d16 ; encoding: 
[0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x80] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_load a5, v[2:5], s[8:15] dmask:0x1 d16 + +// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_store a252, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0xfc,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a252, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_store a1, v[252:255], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0xfc,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[252:255], s[12:19] dmask:0x1 unorm + +// GFX90A: image_store a1, v[2:5], s[16:23] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x04,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[16:23] dmask:0x1 unorm + +// GFX90A: image_store a1, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x2 unorm ; encoding: [0x00,0x12,0x21,0xf0,0x02,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[12:19] dmask:0x2 unorm + +// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:3], v[2:5], s[12:19] dmask:0x3 unorm + +// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x4 unorm ; encoding: [0x00,0x14,0x21,0xf0,0x02,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode 
+image_store a1, v[2:5], s[12:19] dmask:0x4 unorm + +// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0x5 unorm ; encoding: [0x00,0x15,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:3], v[2:5], s[12:19] dmask:0x5 unorm + +// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0x6 unorm ; encoding: [0x00,0x16,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:3], v[2:5], s[12:19] dmask:0x6 unorm + +// GFX90A: image_store a[2:4], v[2:5], s[12:19] dmask:0x7 unorm ; encoding: [0x00,0x17,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:4], v[2:5], s[12:19] dmask:0x7 unorm + +// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x8 unorm ; encoding: [0x00,0x18,0x21,0xf0,0x02,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[12:19] dmask:0x8 unorm + +// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0x9 unorm ; encoding: [0x00,0x19,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:3], v[2:5], s[12:19] dmask:0x9 unorm + +// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0xa unorm ; encoding: [0x00,0x1a,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:3], v[2:5], s[12:19] dmask:0xa unorm + +// GFX90A: image_store a[2:4], v[2:5], s[12:19] dmask:0xb unorm ; encoding: [0x00,0x1b,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:4], v[2:5], s[12:19] dmask:0xb unorm + +// GFX90A: image_store a[2:3], v[2:5], s[12:19] dmask:0xc unorm ; encoding: [0x00,0x1c,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:3], v[2:5], s[12:19] dmask:0xc unorm + +// GFX90A: 
image_store a[2:4], v[2:5], s[12:19] dmask:0xd unorm ; encoding: [0x00,0x1d,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:4], v[2:5], s[12:19] dmask:0xd unorm + +// GFX90A: image_store a[2:4], v[2:5], s[12:19] dmask:0xe unorm ; encoding: [0x00,0x1e,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:4], v[2:5], s[12:19] dmask:0xe unorm + +// GFX90A: image_store a[2:5], v[2:5], s[12:19] dmask:0xf unorm ; encoding: [0x00,0x1f,0x21,0xf0,0x02,0x02,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a[2:5], v[2:5], s[12:19] dmask:0xf unorm + +// GFX90A: image_store a1, v[2:5], s[12:19] unorm ; encoding: [0x00,0x10,0x21,0xf0,0x02,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[12:19] unorm + +// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x21,0xf0,0x02,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[12:19] dmask:0x1 unorm glc + +// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x21,0xf2,0x02,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[12:19] dmask:0x1 unorm slc + +// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x23,0xf0,0x02,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[12:19] dmask:0x1 unorm lwe + +// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x21,0xf0,0x02,0x01,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[12:19] dmask:0x1 unorm da + +// GFX90A: image_store a1, v[2:5], s[12:19] dmask:0x1 unorm d16 ; encoding: 
[0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x80] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_store a1, v[2:5], s[12:19] dmask:0x1 unorm d16 + +// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_swap a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_swap a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_swap a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_swap a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_swap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x41,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x41,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: 
[0x00,0x11,0x41,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x43,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x41,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_swap a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_cmpswap a[252:253], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[252:253], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_cmpswap a[6:7], v[252:255], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0xfc,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[6:7], v[252:255], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[6:7], v[2:5], s[12:19] dmask:0x3 unorm + +// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[92:99] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[6:7], v[2:5], s[92:99] dmask:0x3 unorm + +// GFX90A: 
image_atomic_cmpswap a[6:9], v[2:5], s[8:15] dmask:0xf unorm ; encoding: [0x00,0x1f,0x45,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[6:9], v[2:5], s[8:15] dmask:0xf unorm + +// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x45,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm glc + +// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm slc ; encoding: [0x00,0x13,0x45,0xf2,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm slc + +// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm lwe ; encoding: [0x00,0x13,0x47,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm lwe + +// GFX90A: image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm da ; encoding: [0x00,0x53,0x45,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_cmpswap a[6:7], v[2:5], s[8:15] dmask:0x3 unorm da + +// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_add a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_add a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_add a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode 
+image_atomic_add a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_add a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_add a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_add a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_add a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_add a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x49,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_add a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x49,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x49,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x4b,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x49,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_add a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub 
a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_sub a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_sub a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_sub a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_sub a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_sub a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x4d,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4d,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x4d,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x4f,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub a5, v[2:5], s[8:15] 
dmask:0x1 unorm lwe + +// GFX90A: image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x4d,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_sub a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_smin a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_smin a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_smin a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_smin a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_smin a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x51,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x51,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 
unorm glc + +// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x51,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x53,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x51,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smin a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_umin a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_umin a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_umin a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_umin a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a5, v[2:5], s[92:99] dmask:0x1 unorm 
+ +// GFX90A: image_atomic_umin a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x55,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x55,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x55,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x57,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x55,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umin a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_smax a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_smax a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a5, v[252:255], s[8:15] dmask:0x1 
unorm + +// GFX90A: image_atomic_smax a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_smax a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_smax a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x59,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x59,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x59,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x5b,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x59,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_smax a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm 
+ +// GFX90A: image_atomic_umax a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_umax a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_umax a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_umax a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_umax a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x5d,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x5d,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x5d,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x5f,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm lwe 
+ +// GFX90A: image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x5d,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_umax a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_and a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_and a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_and a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_and a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_and a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x61,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x61,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: 
image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x61,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x63,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x61,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_and a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_or a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_or a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_or a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_or a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_or a[6:7], v[2:5], 
s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x65,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x65,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x65,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x67,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x65,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_or a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_xor a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_xor a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_xor a5, v[2:5], s[12:19] dmask:0x1 unorm ; 
encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_xor a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_xor a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x69,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x69,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x69,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x6b,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x69,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_xor a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_inc a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: 
[0x00,0x11,0x6d,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_inc a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_inc a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_inc a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_inc a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x6d,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x6d,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x6d,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x6f,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: 
[0x00,0x51,0x6d,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_inc a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_dec a252, v[2:5], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0xfc,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a252, v[2:5], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_dec a5, v[252:255], s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0xfc,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a5, v[252:255], s[8:15] dmask:0x1 unorm + +// GFX90A: image_atomic_dec a5, v[2:5], s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x03,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a5, v[2:5], s[12:19] dmask:0x1 unorm + +// GFX90A: image_atomic_dec a5, v[2:5], s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x17,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a5, v[2:5], s[92:99] dmask:0x1 unorm + +// GFX90A: image_atomic_dec a[6:7], v[2:5], s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x71,0xf0,0x02,0x06,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a[6:7], v[2:5], s[8:15] dmask:0x3 unorm + +// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x71,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm glc + +// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm slc ; encoding: 
[0x00,0x11,0x71,0xf2,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm slc + +// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x73,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm lwe + +// GFX90A: image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x71,0xf0,0x02,0x05,0x02,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_atomic_dec a5, v[2:5], s[8:15] dmask:0x1 unorm da + +// GFX90A: image_sample a5, v[0:3], s[8:15], s[12:15] dmask:0x1 ; encoding: [0x00,0x01,0x81,0xf0,0x00,0x05,0x62,0x00] +// NOT-GFX90A: error: operands are not valid for this GPU or mode +image_sample a5, v[0:3], s[8:15], s[12:15] dmask:0x1 Index: llvm/test/MC/AMDGPU/hsa-diag-v3.s =================================================================== --- llvm/test/MC/AMDGPU/hsa-diag-v3.s +++ llvm/test/MC/AMDGPU/hsa-diag-v3.s @@ -1,6 +1,7 @@ // RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX8,NONGFX10,AMDHSA // RUN: not llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,GFX10,AMDHSA // RUN: not llvm-mc -triple amdgcn-amd- -mcpu=gfx803 -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GCN,NONAMDHSA +// RUN: not llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -show-encoding %s 2>&1 >/dev/null | FileCheck %s --check-prefixes=GFX90A,NONGFX10,AMDHSA,ALL .text @@ -18,7 +19,6 @@ .end_amdhsa_kernel // GCN-LABEL: warning: test_amdhsa_kernel_empty -// AMDHSA-NOT: error: unknown directive // NONAMDHSA: error: unknown directive .warning "test_amdhsa_kernel_empty" .amdhsa_kernel 
test_amdhsa_kernel_empty @@ -72,7 +72,88 @@ .amdhsa_next_free_vgpr 0 .end_amdhsa_kernel -// GCN-LABEL: warning: test_amdhsa_wavefront_size32 +// ALL-LABEL: warning: test_amdhsa_accum_offset +// NONGFX90A: error: directive requires gfx90a+ +// GFX90A: error: .amdhsa_next_free_vgpr directive is required +// NONAMDHSA: error: unknown directive +.warning "test_amdhsa_accum_offset" +.amdhsa_kernel test_amdhsa_accum_offset + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +// ALL-LABEL: warning: test_amdhsa_accum_offset_missing +// NONGFX90A: error: directive requires gfx90a+ +// GFX90A: error: .amdhsa_accum_offset directive is required +// NONAMDHSA: error: unknown directive +.warning "test_amdhsa_accum_offset_missing" +.amdhsa_kernel test_amdhsa_accum_offset_missing + .amdhsa_next_free_sgpr 0 + .amdhsa_next_free_vgpr 0 +.end_amdhsa_kernel + +// ALL-LABEL: warning: test_amdhsa_accum_offset_invalid0 +// NONGFX90A: error: directive requires gfx90a+ +// GFX90A: error: accum_offset should be in range [4..256] in increments of 4 +// NONAMDHSA: error: unknown directive +.warning "test_amdhsa_accum_offset_invalid0" +.amdhsa_kernel test_amdhsa_accum_offset_invalid0 + .amdhsa_next_free_sgpr 0 + .amdhsa_next_free_vgpr 0 + .amdhsa_accum_offset 0 +.end_amdhsa_kernel + +// ALL-LABEL: warning: test_amdhsa_accum_offset_invalid5 +// NONGFX90A: error: directive requires gfx90a+ +// GFX90A: error: accum_offset should be in range [4..256] in increments of 4 +// NONAMDHSA: error: unknown directive +.warning "test_amdhsa_accum_offset_invalid5" +.amdhsa_kernel test_amdhsa_accum_offset_invalid5 + .amdhsa_next_free_sgpr 0 + .amdhsa_next_free_vgpr 0 + .amdhsa_accum_offset 5 +.end_amdhsa_kernel + +// ALL-LABEL: warning: test_amdhsa_accum_offset_invalid257 +// NONGFX90A: error: directive requires gfx90a+ +// GFX90A: error: accum_offset should be in range [4..256] in increments of 4 +// NONAMDHSA: error: unknown directive +.warning "test_amdhsa_accum_offset_invalid257" +.amdhsa_kernel
test_amdhsa_accum_offset_invalid257 + .amdhsa_next_free_sgpr 0 + .amdhsa_next_free_vgpr 0 + .amdhsa_accum_offset 257 +.end_amdhsa_kernel + +// ALL-LABEL: warning: test_amdhsa_accum_offset_invalid8 +// NONGFX90A: error: directive requires gfx90a+ +// GFX90A: error: accum_offset exceeds total VGPR allocation +// NONAMDHSA: error: unknown directive +.warning "test_amdhsa_accum_offset_invalid8" +.amdhsa_kernel test_amdhsa_accum_offset_invalid8 + .amdhsa_next_free_sgpr 0 + .amdhsa_next_free_vgpr 0 + .amdhsa_accum_offset 8 +.end_amdhsa_kernel + +// ALL-LABEL: warning: test_amdhsa_tg_split +// NONGFX90A: error: directive requires gfx90a+ +// GFX90A: error: .amdhsa_next_free_vgpr directive is required +// NONAMDHSA: error: unknown directive +.warning "test_amdhsa_tg_split" +.amdhsa_kernel test_amdhsa_tg_split + .amdhsa_tg_split 1 +.end_amdhsa_kernel + +// ALL-LABEL: warning: test_amdhsa_tg_split_invalid +// NONGFX90A: error: directive requires gfx90a+ +// GFX90A: error: value out of range +// NONAMDHSA: error: unknown directive +.warning "test_amdhsa_tg_split_invalid" +.amdhsa_kernel test_amdhsa_tg_split_invalid + .amdhsa_tg_split 5 +.end_amdhsa_kernel + +// ALL-LABEL: warning: test_amdhsa_wavefront_size32 // NONGFX10: error: directive requires gfx10+ // GFX10: error: .amdhsa_next_free_vgpr directive is required // NONAMDHSA: error: unknown directive Index: llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s =================================================================== --- /dev/null +++ llvm/test/MC/AMDGPU/hsa-gfx90a-v3.s @@ -0,0 +1,179 @@ +// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc --amdhsa-code-object-version=3 -triple amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack -filetype=obj < %s > %t +// RUN: llvm-readobj -elf-output-style=GNU -sections -symbols -relocations %t | FileCheck
--check-prefix=OBJDUMP %s + +// big endian not supported +// XFAIL: powerpc-, powerpc64-, s390x, mips-, mips64-, sparc + +// READOBJ: Section Headers +// READOBJ: .text PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9a-f]+}} {{[0-9]+}} AX {{[0-9]+}} {{[0-9]+}} 256 +// READOBJ: .rodata PROGBITS {{[0-9a-f]+}} {{[0-9a-f]+}} 000080 {{[0-9]+}} A {{[0-9]+}} {{[0-9]+}} 64 + +// READOBJ: Relocation section '.rela.rodata' at offset +// READOBJ: 0000000000000010 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 10 +// READOBJ: 0000000000000050 {{[0-9a-f]+}}00000005 R_AMDGPU_REL64 0000000000000000 .text + 110 + +// READOBJ: Symbol table '.symtab' contains {{[0-9]+}} entries: +// READOBJ-DAG: {{[0-9]+}}: 0000000000000100 0 FUNC LOCAL PROTECTED 2 complete +// READOBJ-DAG: {{[0-9]+}}: 0000000000000040 64 OBJECT LOCAL DEFAULT 3 complete.kd +// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 0 FUNC LOCAL PROTECTED 2 minimal +// READOBJ-DAG: {{[0-9]+}}: 0000000000000000 64 OBJECT LOCAL DEFAULT 3 minimal.kd + +// OBJDUMP: Contents of section .rodata +// Note, relocation for KERNEL_CODE_ENTRY_BYTE_OFFSET is not resolved here. 
+// minimal +// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0030 0000ac00 80000000 00000000 00000000 +// complete +// OBJDUMP-NEXT: 0040 01000000 01000000 00000000 00000000 +// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000 +// OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000100 +// OBJDUMP-NEXT: 0070 c1500104 1f0f007f 7f000000 00000000 + +.text +// ASM: .text + +.amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack" +// ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx90a+xnack" + +.p2align 8 +.type minimal,@function +minimal: + s_endpgm + +.p2align 8 +.type complete,@function +complete: + s_endpgm + +.rodata +// ASM: .rodata + +// Test that only specifying required directives is allowed, and that defaulted +// values are omitted. +.p2align 6 +.amdhsa_kernel minimal + .amdhsa_next_free_vgpr 0 + .amdhsa_next_free_sgpr 0 + .amdhsa_accum_offset 4 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel minimal +// ASM: .amdhsa_next_free_vgpr 0 +// ASM-NEXT: .amdhsa_next_free_sgpr 0 +// ASM-NEXT: .amdhsa_accum_offset 4 +// ASM: .amdhsa_tg_split 0 +// ASM: .end_amdhsa_kernel + +// Test that we can specify all available directives with non-default values. 
+.p2align 6 +.amdhsa_kernel complete + .amdhsa_group_segment_fixed_size 1 + .amdhsa_private_segment_fixed_size 1 + .amdhsa_user_sgpr_private_segment_buffer 1 + .amdhsa_user_sgpr_dispatch_ptr 1 + .amdhsa_user_sgpr_queue_ptr 1 + .amdhsa_user_sgpr_kernarg_segment_ptr 1 + .amdhsa_user_sgpr_dispatch_id 1 + .amdhsa_user_sgpr_flat_scratch_init 1 + .amdhsa_user_sgpr_private_segment_size 1 + .amdhsa_system_sgpr_private_segment_wavefront_offset 1 + .amdhsa_system_sgpr_workgroup_id_x 0 + .amdhsa_system_sgpr_workgroup_id_y 1 + .amdhsa_system_sgpr_workgroup_id_z 1 + .amdhsa_system_sgpr_workgroup_info 1 + .amdhsa_system_vgpr_workitem_id 1 + .amdhsa_next_free_vgpr 9 + .amdhsa_next_free_sgpr 27 + .amdhsa_accum_offset 4 + .amdhsa_reserve_vcc 0 + .amdhsa_reserve_flat_scratch 0 + .amdhsa_float_round_mode_32 1 + .amdhsa_float_round_mode_16_64 1 + .amdhsa_float_denorm_mode_32 1 + .amdhsa_float_denorm_mode_16_64 0 + .amdhsa_dx10_clamp 0 + .amdhsa_ieee_mode 0 + .amdhsa_fp16_overflow 1 + .amdhsa_tg_split 1 + .amdhsa_exception_fp_ieee_invalid_op 1 + .amdhsa_exception_fp_denorm_src 1 + .amdhsa_exception_fp_ieee_div_zero 1 + .amdhsa_exception_fp_ieee_overflow 1 + .amdhsa_exception_fp_ieee_underflow 1 + .amdhsa_exception_fp_ieee_inexact 1 + .amdhsa_exception_int_div_zero 1 +.end_amdhsa_kernel + +// ASM: .amdhsa_kernel complete +// ASM-NEXT: .amdhsa_group_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_private_segment_fixed_size 1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 +// ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 1 +// ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1 +// ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 1 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 0 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1 
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 1 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 1 +// ASM-NEXT: .amdhsa_next_free_vgpr 9 +// ASM-NEXT: .amdhsa_next_free_sgpr 27 +// ASM-NEXT: .amdhsa_accum_offset 4 +// ASM-NEXT: .amdhsa_reserve_vcc 0 +// ASM-NEXT: .amdhsa_reserve_flat_scratch 0 +// ASM-NEXT: .amdhsa_float_round_mode_32 1 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 1 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 0 +// ASM-NEXT: .amdhsa_dx10_clamp 0 +// ASM-NEXT: .amdhsa_ieee_mode 0 +// ASM-NEXT: .amdhsa_fp16_overflow 1 +// ASM-NEXT: .amdhsa_tg_split 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op 1 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 1 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 1 +// ASM-NEXT: .amdhsa_exception_int_div_zero 1 +// ASM-NEXT: .end_amdhsa_kernel + +.section .foo + +.byte .amdgcn.gfx_generation_number +// ASM: .byte 9 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v7, s10 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 8 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 11 + +.set .amdgcn.next_free_vgpr, 0 +.set .amdgcn.next_free_sgpr, 0 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 0 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 0 + +v_mov_b32_e32 v16, s3 + +.byte .amdgcn.next_free_vgpr +// ASM: .byte 17 +.byte .amdgcn.next_free_sgpr +// ASM: .byte 4 Index: llvm/test/MC/AMDGPU/mai-gfx90a.s =================================================================== --- /dev/null +++ llvm/test/MC/AMDGPU/mai-gfx90a.s @@ -0,0 +1,2518 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck -check-prefix=GFX90A %s + +v_accvgpr_read_b32 v2, a0 +// GFX90A: v_accvgpr_read_b32 v2, a0 ; encoding: [0x02,0x40,0xd8,0xd3,0x00,0x01,0x00,0x18] + +v_accvgpr_read_b32 
v2, a1 +// GFX90A: v_accvgpr_read_b32 v2, a1 ; encoding: [0x02,0x40,0xd8,0xd3,0x01,0x01,0x00,0x18] + +v_accvgpr_read_b32 v2, a255 +// GFX90A: v_accvgpr_read_b32 v2, a255 ; encoding: [0x02,0x40,0xd8,0xd3,0xff,0x01,0x00,0x18] + +v_accvgpr_read v2, a10 +// GFX90A: v_accvgpr_read_b32 v2, a10 ; encoding: [0x02,0x40,0xd8,0xd3,0x0a,0x01,0x00,0x18] + +v_accvgpr_write_b32 a2, -2.0 +// GFX90A: v_accvgpr_write_b32 a2, -2.0 ; encoding: [0x02,0x40,0xd9,0xd3,0xf5,0x00,0x00,0x18] + +v_accvgpr_write_b32 a2, -2 +// GFX90A: v_accvgpr_write_b32 a2, -2 ; encoding: [0x02,0x40,0xd9,0xd3,0xc2,0x00,0x00,0x18] + +v_accvgpr_write_b32 a2, v1 +// GFX90A: v_accvgpr_write_b32 a2, v1 ; encoding: [0x02,0x40,0xd9,0xd3,0x01,0x01,0x00,0x18] + +v_accvgpr_write a2, v255 +// GFX90A: v_accvgpr_write_b32 a2, v255 ; encoding: [0x02,0x40,0xd9,0xd3,0xff,0x01,0x00,0x18] + +v_accvgpr_mov_b32 a1, a2 +// GFX90A: v_accvgpr_mov_b32 a1, a2 ; encoding: [0x02,0xa5,0x02,0x7e] + +v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 ; encoding: 
[0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] ; encoding: 
[0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 ; encoding: 
[0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 ; encoding: 
[0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 cbsz:3 
abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] 
cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 ; 
encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] ; encoding: 
[0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x13] + 
+v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x04] + 
+v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x14] + 
+v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_32x32x4f16 
v[0:31], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// 
GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// 
GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: 
v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 ; encoding: 
[0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: 
v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 ; encoding: 
[0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: 
[0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 ; encoding: 
[0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xeb] + 
+v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_16x16x16f16 a[0:3], a[0:1], 
v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] 
cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], 
v[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] ; encoding: 
[0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 
a[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 
blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 ; encoding: 
[0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 +// GFX90A: 
v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 
v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 
blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] ; encoding: 
[0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 
cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 
blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] ; encoding: 
[0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x02] + +v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe2] + +v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x12] + +v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf2] + +v_mfma_i32_16x16x16i8 a[0:3], a0, 
v1, 2 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0a] + +v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xea] + +v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1a] + +v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfa] + +v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] cbsz:3 
abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] +// GFX90A: 
v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], 
a0, v1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] +// GFX90A: 
v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_16x16x2bf16 v[0:15], 
a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 
+// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// 
GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, 
v1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xeb] + 
+v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x1c] + 
+v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 ; encoding: 
[0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 ; 
encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] 
cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x04] + +v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xe4] + +v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x14] + +v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xf4] + +v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x0c] + +v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xec] + +v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x1c] + +v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xfc] + +v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x03] + +v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; 
encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xe3] + +v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x13] + +v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xf3] + +v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x0b] + +v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xeb] + +v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x1b] + +v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x1b] + 
+v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xfb] + +v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 
blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 +// GFX90A: 
v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], 
v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: 
[0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] 
+// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_16x16x4bf16_1k 
a[0:15], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xe4] + 
+v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xf4] + 
+v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 
abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], 
a[2:17] +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k 
v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_32x32x8bf16_1k 
v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], 
a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x14] + +v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xec] + +v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x1c] + +v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xfc] + +v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 ; encoding: 
[0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x13] + +v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xf3] + +v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x0b] + +v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 
abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xeb] + +v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xfb] + +v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] +// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] +// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] ; 
encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f64_16x16x4f64 v[0:7], a[0:1], v[2:3], v[2:9] +// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], a[0:1], v[2:3], v[2:9] ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f64_16x16x4f64 v[0:7], v[0:1], a[2:3], v[2:9] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], a[2:3], v[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f64_16x16x4f64 v[0:7], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f64_16x16x4f64 v[0:7], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f64_4x4x4f64 v[0:1], a[0:1], v[2:3], v[2:3] +// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], a[0:1], v[2:3], v[2:3] ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f64_4x4x4f64 v[0:1], v[0:1], a[2:3], v[2:3] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], a[2:3], v[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f64_4x4x4f64 v[0:1], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f64_4x4x4f64 v[0:1], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] +// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] cbsz:3 abid:2 
blgp:7 +// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] +// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x04] + +v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xe4] + +v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 +// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x03] + +v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 +// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x02,0x02] + +v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0xd6,0xe3] + +v_mfma_f64_16x16x4f64 a[0:7], a[0:1], v[2:3], a[2:9] +// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], v[2:3], a[2:9] ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f64_16x16x4f64 a[0:7], v[0:1], a[2:3], a[2:9] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], a[2:3], a[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], -2.0 ; encoding: 
[0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f64_4x4x4f64 a[0:1], a[0:1], v[2:3], a[2:3] +// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], a[0:1], v[2:3], a[2:3] ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x0c] + +v_mfma_f64_4x4x4f64 a[0:1], v[0:1], a[2:3], a[2:3] cbsz:3 abid:2 blgp:7 +// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], a[2:3], a[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xf4] + +v_mfma_f64_4x4x4f64 a[0:1], a[0:1], a[2:3], -2.0 +// GFX90A: v_mfma_f64_4x4x4f64 a[0:1], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x1b] + +v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], 0 +// GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], 0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x02,0x1a] Index: llvm/test/MC/AMDGPU/mimg-gfx90a.s =================================================================== --- /dev/null +++ llvm/test/MC/AMDGPU/mimg-gfx90a.s @@ -0,0 +1,76 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=GFX90A %s + +image_load v[4:6], v[238:241], s[28:35] dmask:0x7 unorm +// GFX90A: image_load v[4:6], v[238:241], s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x00,0xf0,0xee,0x04,0x07,0x00] + +image_load_pck v5, v[0:3], s[8:15] dmask:0x1 glc +// GFX90A: image_load_pck v5, v[0:3], s[8:15] dmask:0x1 glc ; encoding: [0x00,0x21,0x08,0xf0,0x00,0x05,0x02,0x00] + +image_load_pck_sgn v5, v[0:3], s[8:15] dmask:0x1 lwe +// GFX90A: image_load_pck_sgn v5, v[0:3], s[8:15] dmask:0x1 lwe ; encoding: [0x00,0x01,0x0e,0xf0,0x00,0x05,0x02,0x00] + +image_load_mip v5, v[0:3], s[8:15] +// GFX90A: image_load_mip v5, v[0:3], s[8:15] ; encoding: [0x00,0x00,0x04,0xf0,0x00,0x05,0x02,0x00] + +image_load_mip_pck v5, v1, s[8:15] dmask:0x1 +// GFX90A: image_load_mip_pck v5, v1, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x10,0xf0,0x01,0x05,0x02,0x00] + +image_load_mip_pck_sgn v[4:5], v[0:3], s[8:15] dmask:0x5 +// GFX90A: image_load_mip_pck_sgn v[4:5], v[0:3], s[8:15] dmask:0x5 ; encoding: 
[0x00,0x05,0x14,0xf0,0x00,0x04,0x02,0x00] + +image_store v[192:194], v[238:241], s[28:35] dmask:0x7 unorm +// GFX90A: image_store v[192:194], v[238:241], s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x20,0xf0,0xee,0xc0,0x07,0x00] + +image_store_pck v1, v[2:5], s[12:19] dmask:0x1 unorm da +// GFX90A: image_store_pck v1, v[2:5], s[12:19] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x28,0xf0,0x02,0x01,0x03,0x00] + +image_store_mip v1, v[2:5], s[12:19] +// GFX90A: image_store_mip v1, v[2:5], s[12:19] ; encoding: [0x00,0x00,0x24,0xf0,0x02,0x01,0x03,0x00] + +image_store_mip_pck v252, v[2:3], s[12:19] dmask:0x1 a16 +// GFX90A: image_store_mip_pck v252, v[2:3], s[12:19] dmask:0x1 a16 ; encoding: [0x00,0x81,0x2c,0xf0,0x02,0xfc,0x03,0x00] + +image_atomic_add v4, v192, s[28:35] dmask:0x1 unorm glc +// GFX90A: image_atomic_add v4, v192, s[28:35] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_and v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_and v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x60,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_swap v4, v[192:195], s[28:35] dmask:0x1 unorm glc +// GFX90A: image_atomic_swap v4, v[192:195], s[28:35] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_cmpswap v[4:5], v[192:195], s[28:35] dmask:0x3 unorm glc +// GFX90A: image_atomic_cmpswap v[4:5], v[192:195], s[28:35] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_or v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_or v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x64,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_xor v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_xor v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x68,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_sub v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_sub v4, v192, s[28:35] dmask:0x1 unorm ; encoding: 
[0x00,0x11,0x4c,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_smin v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_smin v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x50,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_smax v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_smax v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x58,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_umin v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_umin v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x54,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_umax v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_umax v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5c,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_inc v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_inc v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6c,0xf0,0xc0,0x04,0x07,0x00] + +image_atomic_dec v4, v192, s[28:35] dmask:0x1 unorm +// GFX90A: image_atomic_dec v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x70,0xf0,0xc0,0x04,0x07,0x00] + +image_get_resinfo v5, v1, s[8:15] dmask:0x1 +// GFX90A: image_get_resinfo v5, v1, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x38,0xf0,0x01,0x05,0x02,0x00] + +image_sample v5, v[0:3], s[8:15], s[12:15] dmask:0x1 +// GFX90A: image_sample v5, v[0:3], s[8:15], s[12:15] dmask:0x1 ; encoding: [0x00,0x01,0x80,0xf0,0x00,0x05,0x62,0x00] Index: llvm/test/MC/AMDGPU/misaligned-vgpr-tuples-err.s =================================================================== --- /dev/null +++ llvm/test/MC/AMDGPU/misaligned-vgpr-tuples-err.s @@ -0,0 +1,25 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=GFX90A --implicit-check-not=error: %s + +v_add_f64 v[1:2], v[1:2], v[1:2] +// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned + +global_load_dwordx3 v[1:3], v[0:1], off +// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned + 
+global_load_dwordx4 v[1:4], v[0:1], off +// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned + +image_load v[1:5], v2, s[0:7] dmask:0xf unorm scc +// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned + +v_mfma_f32_32x32x8f16 a[0:15], a[1:2], v[0:1], a[0:15] +// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned + +v_mfma_i32_4x4x4i8 a[1:4], a0, v1, 2 +// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned + +v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[1:16] +// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned + +v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[1:32] +// GFX90A: error: invalid register class: vgpr tuples must be 64 bit aligned Index: llvm/test/MC/AMDGPU/vop_dpp.s =================================================================== --- llvm/test/MC/AMDGPU/vop_dpp.s +++ llvm/test/MC/AMDGPU/vop_dpp.s @@ -116,8 +116,7 @@ //===----------------------------------------------------------------------===// // NOSICI: error: not a valid operand. -// NOGFX9: error: not a valid operand. -// NOVI: error: not a valid operand. +// GCN: v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x00,0x00,0x7e,0x00,0x01,0x09,0xa1] v_nop row_shl:1 row_mask:0xa bank_mask:0x1 bound_ctrl:0 // NOSICI: error: not a valid operand. 
Index: llvm/test/MC/Disassembler/AMDGPU/dpp64.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/AMDGPU/dpp64.txt @@ -0,0 +1,43 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX90A + +# GFX90A: v_ceil_f64_dpp v[0:1], v[2:3] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x30,0x00,0x7e,0x02,0x51,0x01,0xff] +0xfa,0x30,0x00,0x7e,0x02,0x51,0x01,0xff + +# GFX90A: v_fmac_f64_dpp v[0:1], v[2:3], v[4:5] row_newbcast:2 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x08,0x00,0x08,0x02,0x52,0x01,0xff] +0xfa,0x08,0x00,0x08,0x02,0x52,0x01,0xff + +# GFX90A: v_cvt_f32_f64_dpp v5, v[0:1] row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x1e,0x0a,0x7e,0x00,0x5f,0x01,0xff] +0xfa,0x1e,0x0a,0x7e,0x00,0x5f,0x01,0xff + +# GFX90A: v_cvt_i32_f64_dpp v5, v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x06,0x0a,0x7e,0x00,0x51,0x01,0xff] +0xfa,0x06,0x0a,0x7e,0x00,0x51,0x01,0xff + +# GFX90A: v_cvt_u32_f64_dpp v5, v[0:1] row_newbcast:2 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x2a,0x0a,0x7e,0x00,0x52,0x01,0xff] +0xfa,0x2a,0x0a,0x7e,0x00,0x52,0x01,0xff + +# GFX90A: v_floor_f64_dpp v[4:5], v[0:1] row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x34,0x08,0x7e,0x00,0x5f,0x01,0xff] +0xfa,0x34,0x08,0x7e,0x00,0x5f,0x01,0xff + +# GFX90A: v_fract_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x64,0x08,0x7e,0x00,0x51,0x01,0xff] +0xfa,0x64,0x08,0x7e,0x00,0x51,0x01,0xff + +# GFX90A: v_frexp_exp_i32_f64_dpp v5, v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x60,0x0a,0x7e,0x00,0x51,0x01,0xff] +0xfa,0x60,0x0a,0x7e,0x00,0x51,0x01,0xff + +# GFX90A: v_frexp_mant_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x62,0x08,0x7e,0x00,0x51,0x01,0xff] +0xfa,0x62,0x08,0x7e,0x00,0x51,0x01,0xff + +# GFX90A: v_rcp_f64_dpp v[4:5], v[0:1] 
row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4a,0x08,0x7e,0x00,0x51,0x01,0xff] +0xfa,0x4a,0x08,0x7e,0x00,0x51,0x01,0xff + +# GFX90A: v_rndne_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x32,0x08,0x7e,0x00,0x51,0x01,0xff] +0xfa,0x32,0x08,0x7e,0x00,0x51,0x01,0xff + +# GFX90A: v_rsq_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x4c,0x08,0x7e,0x00,0x51,0x01,0xff] +0xfa,0x4c,0x08,0x7e,0x00,0x51,0x01,0xff + +# GFX90A: v_sqrt_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x50,0x08,0x7e,0x00,0x51,0x01,0xff] +0xfa,0x50,0x08,0x7e,0x00,0x51,0x01,0xff + +# GFX90A: v_trunc_f64_dpp v[4:5], v[0:1] row_newbcast:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x2e,0x08,0x7e,0x00,0x51,0x01,0xff] +0xfa,0x2e,0x08,0x7e,0x00,0x51,0x01,0xff Index: llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt @@ -0,0 +1,795 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx908 -disassemble -show-encoding %s | FileCheck --check-prefix=GFX908 %s +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -disassemble -show-encoding %s | FileCheck --check-prefix=GFX90A %s + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c] +0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c] +0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c] +0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c] +0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x1c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel_hi:[0,0,0] ; encoding: 
[0x08,0x00,0xb0,0xd3,0x00,0x01,0x10,0x04] +0x08,0x00,0xb0,0xd3,0x00,0x01,0x10,0x04 + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0xb0,0xd3,0x00,0x01,0x10,0x04] +0x08,0x60,0xb0,0xd3,0x00,0x01,0x10,0x04 + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0xfc] +0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0xfc + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0x1c] +0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0x1c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0xfc] +0x08,0x47,0xb0,0xd3,0x00,0x01,0x10,0xfc + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x3c] +0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x3c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x5c] +0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x5c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x9c] +0x08,0x40,0xb0,0xd3,0x00,0x01,0x10,0x9c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0xb0,0xd3,0x00,0x01,0x10,0x1c] +0x08,0x41,0xb0,0xd3,0x00,0x01,0x10,0x1c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0xb0,0xd3,0x00,0x01,0x10,0x1c] +0x08,0x42,0xb0,0xd3,0x00,0x01,0x10,0x1c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0xb0,0xd3,0x00,0x01,0x10,0x1c] +0x08,0x44,0xb0,0xd3,0x00,0x01,0x10,0x1c + +# GFX90A: v_pk_fma_f32 v[8:9], v[0:1], s[0:1], v[4:5] clamp ; encoding: [0x08,0xc0,0xb0,0xd3,0x00,0x01,0x10,0x1c] +0x08,0xc0,0xb0,0xd3,0x00,0x01,0x10,0x1c + +# GFX90A: v_pk_fma_f32 v[0:1], v[4:5], v[8:9], 
v[16:17] ; encoding: [0x00,0x40,0xb0,0xd3,0x04,0x11,0x42,0x1c] +0x00,0x40,0xb0,0xd3,0x04,0x11,0x42,0x1c + +# GFX90A: v_pk_mul_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18] +0xfe,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0xfe,0x21,0x02,0x18] +0x04,0x00,0xb1,0xd3,0xfe,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], s[2:3], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x02,0x20,0x02,0x18] +0x04,0x00,0xb1,0xd3,0x02,0x20,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], s[100:101], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x64,0x20,0x02,0x18] +0x04,0x00,0xb1,0xd3,0x64,0x20,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], flat_scratch, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x66,0x20,0x02,0x18] +0x04,0x00,0xb1,0xd3,0x66,0x20,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x6a,0x20,0x02,0x18] +0x04,0x00,0xb1,0xd3,0x6a,0x20,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x7e,0x20,0x02,0x18] +0x04,0x00,0xb1,0xd3,0x7e,0x20,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xfd,0x03,0x18] +0x04,0x00,0xb1,0xd3,0x08,0xfd,0x03,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], s[2:3] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x05,0x00,0x18] +0x04,0x00,0xb1,0xd3,0x08,0x05,0x00,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], s[100:101] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xc9,0x00,0x18] +0x04,0x00,0xb1,0xd3,0x08,0xc9,0x00,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], flat_scratch ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xcd,0x00,0x18] +0x04,0x00,0xb1,0xd3,0x08,0xcd,0x00,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xd5,0x00,0x18] +0x04,0x00,0xb1,0xd3,0x08,0xd5,0x00,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0xfd,0x00,0x18] 
+0x04,0x00,0xb1,0xd3,0x08,0xfd,0x00,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18] +0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x08,0xb1,0xd3,0x08,0x21,0x02,0x18] +0x04,0x08,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x10,0xb1,0xd3,0x08,0x21,0x02,0x18] +0x04,0x10,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x18,0xb1,0xd3,0x08,0x21,0x02,0x18] +0x04,0x18,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18] +0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x00] +0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x00 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x08] +0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x08 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x10] +0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x10 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x38] +0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x38 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x58] +0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x58 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: [0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x78] +0x04,0x00,0xb1,0xd3,0x08,0x21,0x02,0x78 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x01,0xb1,0xd3,0x08,0x21,0x02,0x18] +0x04,0x01,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; 
encoding: [0x04,0x02,0xb1,0xd3,0x08,0x21,0x02,0x18] +0x04,0x02,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x03,0xb1,0xd3,0x08,0x21,0x02,0x18] +0x04,0x03,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mul_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0x80,0xb1,0xd3,0x08,0x21,0x02,0x18] +0x04,0x80,0xb1,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[254:255], v[8:9], v[16:17] ; encoding: [0xfe,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18] +0xfe,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[254:255], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0xfe,0x21,0x02,0x18] +0x04,0x00,0xb2,0xd3,0xfe,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], s[2:3], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x02,0x20,0x02,0x18] +0x04,0x00,0xb2,0xd3,0x02,0x20,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], s[100:101], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x64,0x20,0x02,0x18] +0x04,0x00,0xb2,0xd3,0x64,0x20,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], flat_scratch, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x66,0x20,0x02,0x18] +0x04,0x00,0xb2,0xd3,0x66,0x20,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], vcc, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x6a,0x20,0x02,0x18] +0x04,0x00,0xb2,0xd3,0x6a,0x20,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], exec, v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x7e,0x20,0x02,0x18] +0x04,0x00,0xb2,0xd3,0x7e,0x20,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[254:255] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xfd,0x03,0x18] +0x04,0x00,0xb2,0xd3,0x08,0xfd,0x03,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], s[2:3] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x05,0x00,0x18] +0x04,0x00,0xb2,0xd3,0x08,0x05,0x00,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], s[100:101] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xc9,0x00,0x18] +0x04,0x00,0xb2,0xd3,0x08,0xc9,0x00,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], flat_scratch ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xcd,0x00,0x18] 
+0x04,0x00,0xb2,0xd3,0x08,0xcd,0x00,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], vcc ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xd5,0x00,0x18] +0x04,0x00,0xb2,0xd3,0x08,0xd5,0x00,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], exec ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0xfd,0x00,0x18] +0x04,0x00,0xb2,0xd3,0x08,0xfd,0x00,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18] +0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,0] ; encoding: [0x04,0x08,0xb2,0xd3,0x08,0x21,0x02,0x18] +0x04,0x08,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[0,1] ; encoding: [0x04,0x10,0xb2,0xd3,0x08,0x21,0x02,0x18] +0x04,0x10,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel:[1,1] ; encoding: [0x04,0x18,0xb2,0xd3,0x08,0x21,0x02,0x18] +0x04,0x18,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18] +0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x00] +0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x00 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[1,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x08] +0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x08 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] op_sel_hi:[0,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x10] +0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x10 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,0] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x38] +0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x38 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[0,1] ; encoding: [0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x58] +0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x58 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_lo:[1,1] ; encoding: 
[0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x78] +0x04,0x00,0xb2,0xd3,0x08,0x21,0x02,0x78 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,0] ; encoding: [0x04,0x01,0xb2,0xd3,0x08,0x21,0x02,0x18] +0x04,0x01,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[0,1] ; encoding: [0x04,0x02,0xb2,0xd3,0x08,0x21,0x02,0x18] +0x04,0x02,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] neg_hi:[1,1] ; encoding: [0x04,0x03,0xb2,0xd3,0x08,0x21,0x02,0x18] +0x04,0x03,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_add_f32 v[4:5], v[8:9], v[16:17] clamp ; encoding: [0x04,0x80,0xb2,0xd3,0x08,0x21,0x02,0x18] +0x04,0x80,0xb2,0xd3,0x08,0x21,0x02,0x18 + +# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x18] +0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x18 + +# GFX90A: v_pk_mov_b32 v[0:1], flat_scratch, v[4:5] ; encoding: [0x00,0x00,0xb3,0xd3,0x66,0x08,0x02,0x18] +0x00,0x00,0xb3,0xd3,0x66,0x08,0x02,0x18 + +# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], vcc ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0xd5,0x00,0x18] +0x00,0x00,0xb3,0xd3,0x02,0xd5,0x00,0x18 + +# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], s[0:1] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x01,0x00,0x18] +0x00,0x00,0xb3,0xd3,0x02,0x01,0x00,0x18 + +# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1] ; encoding: [0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x10] +0x00,0x00,0xb3,0xd3,0x02,0x09,0x02,0x10 + +# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,0] ; encoding: [0x00,0x08,0xb3,0xd3,0x02,0x09,0x02,0x18] +0x00,0x08,0xb3,0xd3,0x02,0x09,0x02,0x18 + +# GFX90A: v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,1] ; encoding: [0x00,0x18,0xb3,0xd3,0x02,0x09,0x02,0x18] +0x00,0x18,0xb3,0xd3,0x02,0x09,0x02,0x18 + +# GFX908: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 ; encoding: [0x00,0x80,0x09,0xe8,0x00,0x04,0x20,0x80] +# GFX90A: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 scc ; encoding: [0x00,0x80,0x09,0xe8,0x00,0x04,0x20,0x80] 
+0x00,0x80,0x09,0xe8,0x00,0x04,0x20,0x80 + +# GFX908: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc ; encoding: [0x00,0xc0,0x09,0xe8,0x00,0x04,0x20,0x80] +# GFX90A: tbuffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc scc ; encoding: [0x00,0xc0,0x09,0xe8,0x00,0x04,0x20,0x80] +0x00,0xc0,0x09,0xe8,0x00,0x04,0x20,0x80 + +# GFX908: buffer_load_dword v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x8f,0x50,0xe0,0x00,0x05,0x02,0x03] +# GFX90A: buffer_load_dword v5, off, s[8:11], s3 offset:4095 scc ; encoding: [0xff,0x8f,0x50,0xe0,0x00,0x05,0x02,0x03] +0xff,0x8f,0x50,0xe0,0x00,0x05,0x02,0x03 + +# GFX908: buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0xcf,0x50,0xe0,0x00,0x05,0x02,0x03] +# GFX90A: buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc scc ; encoding: [0xff,0xcf,0x50,0xe0,0x00,0x05,0x02,0x03] +0xff,0xcf,0x50,0xe0,0x00,0x05,0x02,0x03 + +# GFX90A: buffer_wbl2 ; encoding: [0x00,0x00,0xa0,0xe0,0x00,0x00,0x00,0x00] +0x00,0x00,0xa0,0xe0,0x00,0x00,0x00,0x00 + +# GFX90A: buffer_invl2 ; encoding: [0x00,0x00,0xa4,0xe0,0x00,0x00,0x00,0x00] +0x00,0x00,0xa4,0xe0,0x00,0x00,0x00,0x00 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x03] +0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x03,0x03] +0xff,0x0f,0x3c,0xe1,0x00,0x04,0x03,0x03 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x18,0x03] +0xff,0x0f,0x3c,0xe1,0x00,0x04,0x18,0x03 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x65] +0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x65 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x7c] +0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x7c + +# GFX90A: 
buffer_atomic_add_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x80] +0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0x80 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0xc1] +0xff,0x0f,0x3c,0xe1,0x00,0x04,0x02,0xc1 + +# GFX90A: buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x3c,0xe1,0x00,0x04,0x02,0x03] +0xff,0x2f,0x3c,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_add_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x3c,0xe1,0x00,0x04,0x02,0x03] +0xff,0x1f,0x3c,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03] +0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03] +0x00,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03] +0x07,0x00,0x3c,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xe1,0x00,0x04,0x02,0x03] +0xff,0x0f,0x3e,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x03] +0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x03,0x03] +0xff,0x0f,0x40,0xe1,0x00,0x04,0x03,0x03 + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x18,0x03] +0xff,0x0f,0x40,0xe1,0x00,0x04,0x18,0x03 + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x65] +0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x65 + +# GFX90A: 
buffer_atomic_min_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x7c] +0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x7c + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x80] +0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0x80 + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0xc1] +0xff,0x0f,0x40,0xe1,0x00,0x04,0x02,0xc1 + +# GFX90A: buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x40,0xe1,0x00,0x04,0x02,0x03] +0xff,0x2f,0x40,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_min_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x40,0xe1,0x00,0x04,0x02,0x03] +0xff,0x1f,0x40,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03] +0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03] +0x00,0x00,0x40,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x40,0xe1,0x00,0x04,0x02,0x03] +0x07,0x00,0x40,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xe1,0x00,0x04,0x02,0x03] +0xff,0x0f,0x42,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x03] +0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x03,0x03] +0xff,0x0f,0x44,0xe1,0x00,0x04,0x03,0x03 + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x18,0x03] +0xff,0x0f,0x44,0xe1,0x00,0x04,0x18,0x03 + +# GFX90A: 
buffer_atomic_max_f64 v[4:5], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x65] +0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x65 + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x7c] +0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x7c + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x80] +0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0x80 + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0xc1] +0xff,0x0f,0x44,0xe1,0x00,0x04,0x02,0xc1 + +# GFX90A: buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x44,0xe1,0x00,0x04,0x02,0x03] +0xff,0x2f,0x44,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_max_f64 v[4:5], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x44,0xe1,0x00,0x04,0x02,0x03] +0xff,0x1f,0x44,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03] +0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03] +0x00,0x00,0x44,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x44,0xe1,0x00,0x04,0x02,0x03] +0x07,0x00,0x44,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xe1,0x00,0x04,0x02,0x03] +0xff,0x0f,0x46,0xe1,0x00,0x04,0x02,0x03 + +# GFX90A: ds_add_f64 v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0x01,0x02,0x00,0x00] +0xff,0xff,0xb8,0xd8,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_f64 v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xb8,0xd8,0xff,0x02,0x00,0x00] +0xff,0xff,0xb8,0xd8,0xff,0x02,0x00,0x00 + +# GFX90A: ds_add_f64 v1, v[254:255] offset:65535 ; encoding: 
[0xff,0xff,0xb8,0xd8,0x01,0xfe,0x00,0x00] +0xff,0xff,0xb8,0xd8,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_add_f64 v1, v[2:3] ; encoding: [0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00] +0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_f64 v1, v[2:3] ; encoding: [0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00] +0x00,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_f64 v1, v[2:3] offset:4 ; encoding: [0x04,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00] +0x04,0x00,0xb8,0xd8,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_f64 v1, v[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xb9,0xd8,0x01,0x02,0x00,0x00] +0xff,0xff,0xb9,0xd8,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0x04] +0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0x04 + +# GFX90A: ds_add_rtn_f64 v[254:255], v1, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0xfe] +0xff,0xff,0xf8,0xd8,0x01,0x02,0x00,0xfe + +# GFX90A: ds_add_rtn_f64 v[4:5], v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0xff,0x02,0x00,0x04] +0xff,0xff,0xf8,0xd8,0xff,0x02,0x00,0x04 + +# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[254:255] offset:65535 ; encoding: [0xff,0xff,0xf8,0xd8,0x01,0xfe,0x00,0x04] +0xff,0xff,0xf8,0xd8,0x01,0xfe,0x00,0x04 + +# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] ; encoding: [0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04] +0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04 + +# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] ; encoding: [0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04] +0x00,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04 + +# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:4 ; encoding: [0x04,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04] +0x04,0x00,0xf8,0xd8,0x01,0x02,0x00,0x04 + +# GFX90A: ds_add_rtn_f64 v[4:5], v1, v[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xf9,0xd8,0x01,0x02,0x00,0x04] +0xff,0xff,0xf9,0xd8,0x01,0x02,0x00,0x04 + +# GFX908: flat_load_dword v0, v[0:1] ; encoding: [0x00,0x00,0x50,0xde,0x00,0x00,0x00,0x00] +# GFX90A: flat_load_dword v0, v[0:1] scc ; 
encoding: [0x00,0x00,0x50,0xde,0x00,0x00,0x00,0x00] +0x00,0x00,0x50,0xde,0x00,0x00,0x00,0x00 + +# GFX908: flat_load_dword v0, v[0:1] glc ; encoding: [0x00,0x00,0x51,0xde,0x00,0x00,0x00,0x00] +# GFX90A: flat_load_dword v0, v[0:1] glc scc ; encoding: [0x00,0x00,0x51,0xde,0x00,0x00,0x00,0x00] +0x00,0x00,0x51,0xde,0x00,0x00,0x00,0x00 + +# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0x00,0x02,0x00,0x00] +0xff,0x0f,0x3c,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc scc ; encoding: [0xff,0x0f,0x3d,0xdf,0x00,0x02,0x00,0x00] +0xff,0x0f,0x3d,0xdf,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_add_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0xfe,0x02,0x00,0x00] +0xff,0x0f,0x3c,0xdd,0xfe,0x02,0x00,0x00 + +# GFX90A: flat_atomic_add_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x3c,0xdd,0x00,0xfe,0x00,0x00] +0xff,0x0f,0x3c,0xdd,0x00,0xfe,0x00,0x00 + +# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00] +0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00] +0x00,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00] +0x07,0x00,0x3c,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x3d,0xdd,0x00,0x02,0x00,0x00] +0xff,0x0f,0x3d,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xdd,0x00,0x02,0x00,0x00] +0xff,0x0f,0x3e,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0x00,0x02,0x00,0x00] +0xff,0x0f,0x40,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_min_f64 v[254:255], v[2:3] offset:4095 ; encoding: 
[0xff,0x0f,0x40,0xdd,0xfe,0x02,0x00,0x00] +0xff,0x0f,0x40,0xdd,0xfe,0x02,0x00,0x00 + +# GFX90A: flat_atomic_min_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdd,0x00,0xfe,0x00,0x00] +0xff,0x0f,0x40,0xdd,0x00,0xfe,0x00,0x00 + +# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00] +0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00] +0x00,0x00,0x40,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x40,0xdd,0x00,0x02,0x00,0x00] +0x07,0x00,0x40,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x41,0xdd,0x00,0x02,0x00,0x00] +0xff,0x0f,0x41,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_min_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdd,0x00,0x02,0x00,0x00] +0xff,0x0f,0x42,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0x00,0x02,0x00,0x00] +0xff,0x0f,0x44,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_max_f64 v[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0xfe,0x02,0x00,0x00] +0xff,0x0f,0x44,0xdd,0xfe,0x02,0x00,0x00 + +# GFX90A: flat_atomic_max_f64 v[0:1], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdd,0x00,0xfe,0x00,0x00] +0xff,0x0f,0x44,0xdd,0x00,0xfe,0x00,0x00 + +# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00] +0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] ; encoding: [0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00] +0x00,0x00,0x44,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:7 ; encoding: [0x07,0x00,0x44,0xdd,0x00,0x02,0x00,0x00] +0x07,0x00,0x44,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] offset:4095 glc ; 
encoding: [0xff,0x0f,0x45,0xdd,0x00,0x02,0x00,0x00] +0xff,0x0f,0x45,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_max_f64 v[0:1], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdd,0x00,0x02,0x00,0x00] +0xff,0x0f,0x46,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: global_atomic_add_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x3c,0xdd,0x00,0x02,0x7f,0x00] +0x00,0x80,0x3c,0xdd,0x00,0x02,0x7f,0x00 + +# GFX90A: global_atomic_min_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x40,0xdd,0x00,0x02,0x7f,0x00] +0x00,0x80,0x40,0xdd,0x00,0x02,0x7f,0x00 + +# GFX90A: global_atomic_max_f64 v[0:1], v[2:3], off ; encoding: [0x00,0x80,0x44,0xdd,0x00,0x02,0x7f,0x00] +0x00,0x80,0x44,0xdd,0x00,0x02,0x7f,0x00 + +# GFX908: image_load v[0:4], v2, s[0:7] dmask:0xf unorm tfe ; encoding: [0x80,0x1f,0x01,0xf0,0x02,0x00,0x00,0x00] +# GFX90A: image_load a0, v2, s[0:7] dmask:0xf unorm scc ; encoding: [0x80,0x1f,0x01,0xf0,0x02,0x00,0x00,0x00] +0x80,0x1f,0x01,0xf0,0x02,0x00,0x00,0x00 + +# GFX90A: v_fmac_f64_e32 v[4:5], v[2:3], v[4:5] ; encoding: [0x02,0x09,0x08,0x08] +0x02,0x09,0x08,0x08 + +# GFX90A: v_fmac_f64_e32 v[254:255], v[2:3], v[4:5] ; encoding: [0x02,0x09,0xfc,0x09] +0x02,0x09,0xfc,0x09 + +# GFX90A: v_fmac_f64_e32 v[4:5], v[254:255], v[4:5] ; encoding: [0xfe,0x09,0x08,0x08] +0xfe,0x09,0x08,0x08 + +# GFX90A: v_fmac_f64_e32 v[4:5], flat_scratch, v[4:5] ; encoding: [0x66,0x08,0x08,0x08] +0x66,0x08,0x08,0x08 + +# GFX90A: v_fmac_f64_e32 v[4:5], vcc, v[4:5] ; encoding: [0x6a,0x08,0x08,0x08] +0x6a,0x08,0x08,0x08 + +# GFX90A: v_fmac_f64_e32 v[4:5], exec, v[4:5] ; encoding: [0x7e,0x08,0x08,0x08] +0x7e,0x08,0x08,0x08 + +# GFX90A: v_fmac_f64_e32 v[4:5], 0, v[4:5] ; encoding: [0x80,0x08,0x08,0x08] +0x80,0x08,0x08,0x08 + +# GFX90A: v_fmac_f64_e32 v[4:5], -1, v[4:5] ; encoding: [0xc1,0x08,0x08,0x08] +0xc1,0x08,0x08,0x08 + +# GFX90A: v_fmac_f64_e32 v[4:5], 0.5, v[4:5] ; encoding: [0xf0,0x08,0x08,0x08] +0xf0,0x08,0x08,0x08 + +# GFX90A: v_fmac_f64_e32 v[4:5], -4.0, v[4:5] ; encoding: 
[0xf7,0x08,0x08,0x08] +0xf7,0x08,0x08,0x08 + +# GFX90A: v_fmac_f64_e32 v[4:5], 0xaf123456, v[4:5] ; encoding: [0xff,0x08,0x08,0x08,0x56,0x34,0x12,0xaf] +0xff,0x08,0x08,0x08,0x56,0x34,0x12,0xaf + +# GFX90A: v_fmac_f64_e32 v[4:5], 0x3f717273, v[4:5] ; encoding: [0xff,0x08,0x08,0x08,0x73,0x72,0x71,0x3f] +0xff,0x08,0x08,0x08,0x73,0x72,0x71,0x3f + +# GFX90A: v_fmac_f64_e32 v[4:5], v[2:3], v[254:255] ; encoding: [0x02,0xfd,0x09,0x08] +0x02,0xfd,0x09,0x08 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x00] +0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[254:255], v[2:3], v[8:9] ; encoding: [0xfe,0x00,0x04,0xd1,0x02,0x11,0x02,0x00] +0xfe,0x00,0x04,0xd1,0x02,0x11,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[254:255], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xfe,0x11,0x02,0x00] +0x04,0x00,0x04,0xd1,0xfe,0x11,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], flat_scratch, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x66,0x10,0x02,0x00] +0x04,0x00,0x04,0xd1,0x66,0x10,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], vcc, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x6a,0x10,0x02,0x00] +0x04,0x00,0x04,0xd1,0x6a,0x10,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], exec, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x7e,0x10,0x02,0x00] +0x04,0x00,0x04,0xd1,0x7e,0x10,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], 0, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x80,0x10,0x02,0x00] +0x04,0x00,0x04,0xd1,0x80,0x10,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], -1, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xc1,0x10,0x02,0x00] +0x04,0x00,0x04,0xd1,0xc1,0x10,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], 0.5, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xf0,0x10,0x02,0x00] +0x04,0x00,0x04,0xd1,0xf0,0x10,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], -4.0, v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0xf7,0x10,0x02,0x00] +0x04,0x00,0x04,0xd1,0xf7,0x10,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[254:255] ; encoding: 
[0x04,0x00,0x04,0xd1,0x02,0xfd,0x03,0x00] +0x04,0x00,0x04,0xd1,0x02,0xfd,0x03,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], flat_scratch ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xcd,0x00,0x00] +0x04,0x00,0x04,0xd1,0x02,0xcd,0x00,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], vcc ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xd5,0x00,0x00] +0x04,0x00,0x04,0xd1,0x02,0xd5,0x00,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], exec ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xfd,0x00,0x00] +0x04,0x00,0x04,0xd1,0x02,0xfd,0x00,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], 0 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x01,0x01,0x00] +0x04,0x00,0x04,0xd1,0x02,0x01,0x01,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -1 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x83,0x01,0x00] +0x04,0x00,0x04,0xd1,0x02,0x83,0x01,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], 0.5 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xe1,0x01,0x00] +0x04,0x00,0x04,0xd1,0x02,0xe1,0x01,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -4.0 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0xef,0x01,0x00] +0x04,0x00,0x04,0xd1,0x02,0xef,0x01,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], -v[2:3], v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x20] +0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x20 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x40] +0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x40 + +# GFX90A: v_fmac_f64_e64 v[4:5], -v[2:3], -v[8:9] ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x60] +0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x60 + +# GFX90A: v_fmac_f64_e64 v[4:5], |v[2:3]|, v[8:9] ; encoding: [0x04,0x01,0x04,0xd1,0x02,0x11,0x02,0x00] +0x04,0x01,0x04,0xd1,0x02,0x11,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], |v[8:9]| ; encoding: [0x04,0x02,0x04,0xd1,0x02,0x11,0x02,0x00] +0x04,0x02,0x04,0xd1,0x02,0x11,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], |v[2:3]|, |v[8:9]| ; encoding: [0x04,0x03,0x04,0xd1,0x02,0x11,0x02,0x00] +0x04,0x03,0x04,0xd1,0x02,0x11,0x02,0x00 + +# GFX90A: 
v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] clamp ; encoding: [0x04,0x80,0x04,0xd1,0x02,0x11,0x02,0x00] +0x04,0x80,0x04,0xd1,0x02,0x11,0x02,0x00 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x08] +0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x08 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:4 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x10] +0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x10 + +# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x18] +0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x18 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00] +0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00] +0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v255, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00] +0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, s1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, s101, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, m0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, exec_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, exec_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00] 
+0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, 0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, -1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, 0.5, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, -4.0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00] +0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, v255 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00] +0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, s2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00] +0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, s101 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00] +0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00] +0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00] +0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, m0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00] +0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00] +0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00] +0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, 0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00] +0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, -1 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00] 
+0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, 0.5 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00] +0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, -4.0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00] +0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, -v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20] +0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40] +0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40 + +# GFX90A: v_mul_legacy_f32_e64 v5, -v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60] +0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60 + +# GFX90A: v_mul_legacy_f32_e64 v5, |v1|, v2 ; encoding: [0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00] +0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, |v2| ; encoding: [0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00] +0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, |v1|, |v2| ; encoding: [0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00] +0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 clamp ; encoding: [0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00] +0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08] +0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:4 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10] +0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10 + +# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 div:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18] +0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18 + +# GFX90A: v_xor_b32_dpp v6, v29, v27 row_newbcast:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x50,0x01,0xff] +0xfa,0x36,0x0c,0x2a,0x1d,0x50,0x01,0xff + +# GFX90A: v_xor_b32_dpp v6, v29, v27 row_newbcast:7 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x57,0x01,0xff] +0xfa,0x36,0x0c,0x2a,0x1d,0x57,0x01,0xff + +# GFX90A: v_xor_b32_dpp v6, v29, v27 row_newbcast:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x5f,0x01,0xff] +0xfa,0x36,0x0c,0x2a,0x1d,0x5f,0x01,0xff + +# GFX90A: buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80] +0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80 + +# GFX90A: buffer_atomic_add_f32 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80] +0x00,0x60,0x34,0xe1,0x02,0x00,0x01,0x80 + +# GFX90A: buffer_atomic_pk_add_f16 v0, v2, s[4:7], 0 idxen glc ; encoding: [0x00,0x60,0x38,0xe1,0x02,0x00,0x01,0x80] +0x00,0x60,0x38,0xe1,0x02,0x00,0x01,0x80 + +# GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc ; encoding: [0x00,0x80,0x35,0xdd,0x00,0x02,0x7f,0x00] +0x00,0x80,0x35,0xdd,0x00,0x02,0x7f,0x00 + +# GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc ; encoding: [0x00,0x80,0x39,0xdd,0x00,0x02,0x7f,0x00] +0x00,0x80,0x39,0xdd,0x00,0x02,0x7f,0x00 + +# GFX90A: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x3d,0xdd,0x00,0x02,0x7f,0x00] +0x00,0x80,0x3d,0xdd,0x00,0x02,0x7f,0x00 + +# GFX90A: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x45,0xdd,0x00,0x02,0x7f,0x00] +0x00,0x80,0x45,0xdd,0x00,0x02,0x7f,0x00 + +# GFX90A: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; encoding: [0x00,0x80,0x41,0xdd,0x00,0x02,0x7f,0x00] +0x00,0x80,0x41,0xdd,0x00,0x02,0x7f,0x00 + +# GFX90A: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x3d,0xdd,0x00,0x02,0x00,0x00] +0x00,0x00,0x3d,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x45,0xdd,0x00,0x02,0x00,0x00] +0x00,0x00,0x45,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00] 
+0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00 Index: llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/AMDGPU/gfx90a_ldst_acc.txt @@ -0,0 +1,8395 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -disassemble -show-encoding %s | FileCheck --check-prefix=GFX90A %s + +# GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x40,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_ubyte a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x40,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x40,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte a5, v[2:3] ; encoding: [0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte a5, v[2:3] ; encoding: [0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x40,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x40,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x40,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x41,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x41,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x42,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x44,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_sbyte a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x44,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x44,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: 
flat_load_sbyte a5, v[2:3] ; encoding: [0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte a5, v[2:3] ; encoding: [0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x44,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x44,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x44,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x45,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x45,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x46,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ushort a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ushort a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x48,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_ushort a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x48,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x48,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_ushort a5, v[2:3] ; encoding: [0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ushort a5, v[2:3] ; encoding: [0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x48,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ushort a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x48,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x48,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ushort a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x49,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x49,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ushort a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x4a,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sshort a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0x05] 
+0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sshort a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x4c,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_sshort a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x4c,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x4c,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_sshort a5, v[2:3] ; encoding: [0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sshort a5, v[2:3] ; encoding: [0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sshort a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x4c,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sshort a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x4d,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x4d,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sshort a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x4e,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_dword a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_dword a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x50,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_dword a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x50,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x50,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_dword a5, v[2:3] ; encoding: [0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_dword a5, v[2:3] ; encoding: [0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x50,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_dword a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x50,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x50,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_dword a5, v[2:3] offset:4095 glc ; encoding: 
[0xff,0x0f,0x51,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x51,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_dword a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x52,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x52,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0x06] +0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx2 a[254:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0xfe] +0xff,0x0f,0x54,0xdc,0x02,0x00,0x80,0xfe + +# GFX90A: flat_load_dwordx2 a[6:7], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x54,0xdc,0xfe,0x00,0x80,0x06] +0xff,0x0f,0x54,0xdc,0xfe,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] ; encoding: [0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06] +0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] ; encoding: [0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06] +0x00,0x00,0x54,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:7 ; encoding: [0x07,0x00,0x54,0xdc,0x02,0x00,0x80,0x06] +0x07,0x00,0x54,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x55,0xdc,0x02,0x00,0x80,0x06] +0xff,0x0f,0x55,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx2 a[6:7], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xdc,0x02,0x00,0x80,0x06] +0xff,0x0f,0x56,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0x06] +0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx3 a[252:254], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0xfc] +0xff,0x0f,0x58,0xdc,0x02,0x00,0x80,0xfc + +# GFX90A: flat_load_dwordx3 a[6:8], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x58,0xdc,0xfe,0x00,0x80,0x06] +0xff,0x0f,0x58,0xdc,0xfe,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] ; encoding: 
[0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06] +0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] ; encoding: [0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06] +0x00,0x00,0x58,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:7 ; encoding: [0x07,0x00,0x58,0xdc,0x02,0x00,0x80,0x06] +0x07,0x00,0x58,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x59,0xdc,0x02,0x00,0x80,0x06] +0xff,0x0f,0x59,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx3 a[6:8], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xdc,0x02,0x00,0x80,0x06] +0xff,0x0f,0x5a,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0x06] +0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx4 a[252:255], v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0xfc] +0xff,0x0f,0x5c,0xdc,0x02,0x00,0x80,0xfc + +# GFX90A: flat_load_dwordx4 a[6:9], v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x5c,0xdc,0xfe,0x00,0x80,0x06] +0xff,0x0f,0x5c,0xdc,0xfe,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] ; encoding: [0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06] +0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] ; encoding: [0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06] +0x00,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:7 ; encoding: [0x07,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06] +0x07,0x00,0x5c,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x5d,0xdc,0x02,0x00,0x80,0x06] +0xff,0x0f,0x5d,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_load_dwordx4 a[6:9], v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xdc,0x02,0x00,0x80,0x06] +0xff,0x0f,0x5e,0xdc,0x02,0x00,0x80,0x06 + +# GFX90A: flat_store_byte v[2:3], a2 offset:4095 ; encoding: 
[0xff,0x0f,0x60,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x60,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0xfe,0x02,0x80,0x00] +0xff,0x0f,0x60,0xdc,0xfe,0x02,0x80,0x00 + +# GFX90A: flat_store_byte v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x60,0xdc,0x02,0xff,0x80,0x00] +0xff,0x0f,0x60,0xdc,0x02,0xff,0x80,0x00 + +# GFX90A: flat_store_byte v[2:3], a2 ; encoding: [0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte v[2:3], a2 ; encoding: [0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x60,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x60,0xdc,0x02,0x02,0x80,0x00] +0x07,0x00,0x60,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x61,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x61,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x62,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x64,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte_d16_hi v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0xfe,0x02,0x80,0x00] +0xff,0x0f,0x64,0xdc,0xfe,0x02,0x80,0x00 + +# GFX90A: flat_store_byte_d16_hi v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x64,0xdc,0x02,0xff,0x80,0x00] +0xff,0x0f,0x64,0xdc,0x02,0xff,0x80,0x00 + +# GFX90A: flat_store_byte_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x64,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x64,0xdc,0x02,0x02,0x80,0x00] 
+0x07,0x00,0x64,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x65,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x65,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_byte_d16_hi v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x66,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x68,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0xfe,0x02,0x80,0x00] +0xff,0x0f,0x68,0xdc,0xfe,0x02,0x80,0x00 + +# GFX90A: flat_store_short v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x68,0xdc,0x02,0xff,0x80,0x00] +0xff,0x0f,0x68,0xdc,0x02,0xff,0x80,0x00 + +# GFX90A: flat_store_short v[2:3], a2 ; encoding: [0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short v[2:3], a2 ; encoding: [0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x68,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x68,0xdc,0x02,0x02,0x80,0x00] +0x07,0x00,0x68,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x69,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x69,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x6a,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x6c,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short_d16_hi v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0xfe,0x02,0x80,0x00] +0xff,0x0f,0x6c,0xdc,0xfe,0x02,0x80,0x00 + +# GFX90A: flat_store_short_d16_hi v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xdc,0x02,0xff,0x80,0x00] +0xff,0x0f,0x6c,0xdc,0x02,0xff,0x80,0x00 
+ +# GFX90A: flat_store_short_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short_d16_hi v[2:3], a2 ; encoding: [0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00] +0x07,0x00,0x6c,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x6d,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x6d,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_short_d16_hi v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x6e,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dword v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x70,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dword v[254:255], a2 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0xfe,0x02,0x80,0x00] +0xff,0x0f,0x70,0xdc,0xfe,0x02,0x80,0x00 + +# GFX90A: flat_store_dword v[2:3], a255 offset:4095 ; encoding: [0xff,0x0f,0x70,0xdc,0x02,0xff,0x80,0x00] +0xff,0x0f,0x70,0xdc,0x02,0xff,0x80,0x00 + +# GFX90A: flat_store_dword v[2:3], a2 ; encoding: [0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dword v[2:3], a2 ; encoding: [0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x70,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dword v[2:3], a2 offset:7 ; encoding: [0x07,0x00,0x70,0xdc,0x02,0x02,0x80,0x00] +0x07,0x00,0x70,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dword v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x71,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x71,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dword v[2:3], a2 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x72,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 ; encoding: 
[0xff,0x0f,0x74,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x74,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx2 v[254:255], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0xfe,0x02,0x80,0x00] +0xff,0x0f,0x74,0xdc,0xfe,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx2 v[2:3], a[254:255] offset:4095 ; encoding: [0xff,0x0f,0x74,0xdc,0x02,0xfe,0x80,0x00] +0xff,0x0f,0x74,0xdc,0x02,0xfe,0x80,0x00 + +# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] ; encoding: [0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] ; encoding: [0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x74,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:7 ; encoding: [0x07,0x00,0x74,0xdc,0x02,0x02,0x80,0x00] +0x07,0x00,0x74,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x75,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x75,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx2 v[2:3], a[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x76,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x78,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx3 v[254:255], a[2:4] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0xfe,0x02,0x80,0x00] +0xff,0x0f,0x78,0xdc,0xfe,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx3 v[2:3], a[252:254] offset:4095 ; encoding: [0xff,0x0f,0x78,0xdc,0x02,0xfc,0x80,0x00] +0xff,0x0f,0x78,0xdc,0x02,0xfc,0x80,0x00 + +# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] ; encoding: [0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] ; encoding: [0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x78,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:7 ; encoding: 
[0x07,0x00,0x78,0xdc,0x02,0x02,0x80,0x00] +0x07,0x00,0x78,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 glc ; encoding: [0xff,0x0f,0x79,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x79,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx3 v[2:3], a[2:4] offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x7a,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x7c,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx4 v[254:255], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0xfe,0x02,0x80,0x00] +0xff,0x0f,0x7c,0xdc,0xfe,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx4 v[2:3], a[252:255] offset:4095 ; encoding: [0xff,0x0f,0x7c,0xdc,0x02,0xfc,0x80,0x00] +0xff,0x0f,0x7c,0xdc,0x02,0xfc,0x80,0x00 + +# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] ; encoding: [0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] ; encoding: [0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00] +0x00,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:7 ; encoding: [0x07,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00] +0x07,0x00,0x7c,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x7d,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x7d,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_store_dwordx4 v[2:3], a[2:5] offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xdc,0x02,0x02,0x80,0x00] +0xff,0x0f,0x7e,0xdc,0x02,0x02,0x80,0x00 + +# GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x80,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_ubyte_d16 a5, v[254:255] offset:4095 ; encoding: 
[0xff,0x0f,0x80,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x80,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x80,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x80,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x80,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x81,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x82,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x84,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_ubyte_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x84,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x84,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x84,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x84,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x85,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_ubyte_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xdc,0x02,0x00,0x80,0x05] 
+0xff,0x0f,0x86,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x88,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_sbyte_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x88,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x88,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x88,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x88,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x89,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x8a,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x8c,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_sbyte_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x8c,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05 + +# 
GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x8c,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x8d,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_sbyte_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x8e,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16 a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x90,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: flat_load_short_d16 a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x90,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16 a5, v[2:3] ; encoding: [0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x90,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16 a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x90,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x90,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x91,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16 a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x92,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16_hi a255, v[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0xff] +0xff,0x0f,0x94,0xdc,0x02,0x00,0x80,0xff + +# GFX90A: 
flat_load_short_d16_hi a5, v[254:255] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdc,0xfe,0x00,0x80,0x05] +0xff,0x0f,0x94,0xdc,0xfe,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16_hi a5, v[2:3] ; encoding: [0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05] +0x00,0x00,0x94,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:7 ; encoding: [0x07,0x00,0x94,0xdc,0x02,0x00,0x80,0x05] +0x07,0x00,0x94,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x95,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_load_short_d16_hi a5, v[2:3] offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xdc,0x02,0x00,0x80,0x05] +0xff,0x0f,0x96,0xdc,0x02,0x00,0x80,0x05 + +# GFX90A: flat_atomic_swap a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x01,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_cmpswap a0, v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x05,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_add a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x09,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_sub a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x0d,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_smin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x11,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_umin a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x15,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_smax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x19,0xdd,0x02,0x02,0x80,0x00 + +# 
GFX90A: flat_atomic_umax a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x1d,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_and a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x21,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_or a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x25,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_xor a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x29,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_inc a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x2d,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_dec a0, v[2:3], a2 offset:4095 glc ; encoding: [0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x31,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_swap_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x81,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_cmpswap_x2 a[0:1], v[2:3], a[2:5] offset:4095 glc ; encoding: [0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x85,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_add_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x89,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_sub_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x8d,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_smin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x91,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_umin_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x95,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_smax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc 
; encoding: [0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x99,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_umax_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x9d,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_and_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xa1,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_or_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xa5,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_xor_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xa9,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_inc_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xad,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_dec_x2 a[0:1], v[2:3], a[2:3] offset:4095 glc ; encoding: [0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xb1,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_swap v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x00,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_cmpswap v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x04,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x04,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_add v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x08,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x08,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_sub v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x0c,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_smin v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x10,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x10,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_umin v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x14,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x14,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: 
flat_atomic_smax v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x18,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x18,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_umax v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x1c,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_and v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x20,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x20,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_or v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x24,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x24,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_xor v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x28,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x28,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_inc v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x2c,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_dec v[2:3], a2 offset:4095 ; encoding: [0xff,0x0f,0x30,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x30,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_swap_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x80,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x80,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_cmpswap_x2 v[2:3], a[2:5] offset:4095 ; encoding: [0xff,0x0f,0x84,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x84,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_add_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x88,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x88,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_sub_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x8c,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x8c,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_smin_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x90,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x90,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_umin_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x94,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x94,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_smax_x2 v[2:3], a[2:3] 
offset:4095 ; encoding: [0xff,0x0f,0x98,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x98,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_umax_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0x9c,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0x9c,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_and_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa0,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xa0,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_or_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa4,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xa4,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_xor_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xa8,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xa8,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_inc_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xac,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xac,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: flat_atomic_dec_x2 v[2:3], a[2:3] offset:4095 ; encoding: [0xff,0x0f,0xb0,0xdd,0x02,0x02,0x80,0x00] +0xff,0x0f,0xb0,0xdd,0x02,0x02,0x80,0x00 + +# GFX90A: global_load_ubyte a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_ubyte a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x40,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_ubyte a5, v[2:3], off ; encoding: [0x00,0x80,0x40,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x40,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_sbyte a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_sbyte a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x44,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_sbyte a5, v[2:3], off ; encoding: [0x00,0x80,0x44,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x44,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_ushort a5, v[2:3], off offset:-1 ; encoding: 
[0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_ushort a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x48,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_ushort a5, v[2:3], off ; encoding: [0x00,0x80,0x48,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x48,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_sshort a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_sshort a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x4c,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_sshort a5, v[2:3], off ; encoding: [0x00,0x80,0x4c,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x4c,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_dword a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_dword a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x50,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_dword a5, v[2:3], off ; encoding: [0x00,0x80,0x50,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x50,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_dwordx2 a[6:7], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0x06] +0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0x06 + +# GFX90A: global_load_dwordx2 a[254:255], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0xfe] +0xff,0x9f,0x54,0xdc,0x02,0x00,0xff,0xfe + +# GFX90A: global_load_dwordx2 a[6:7], v[2:3], off ; encoding: [0x00,0x80,0x54,0xdc,0x02,0x00,0xff,0x06] +0x00,0x80,0x54,0xdc,0x02,0x00,0xff,0x06 + +# GFX90A: global_load_dwordx3 a[6:8], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0x06] +0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0x06 + +# GFX90A: global_load_dwordx3 a[252:254], v[2:3], off offset:-1 ; encoding: 
[0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0xfc] +0xff,0x9f,0x58,0xdc,0x02,0x00,0xff,0xfc + +# GFX90A: global_load_dwordx3 a[6:8], v[2:3], off ; encoding: [0x00,0x80,0x58,0xdc,0x02,0x00,0xff,0x06] +0x00,0x80,0x58,0xdc,0x02,0x00,0xff,0x06 + +# GFX90A: global_load_dwordx4 a[6:9], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0x06] +0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0x06 + +# GFX90A: global_load_dwordx4 a[252:255], v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0xfc] +0xff,0x9f,0x5c,0xdc,0x02,0x00,0xff,0xfc + +# GFX90A: global_load_dwordx4 a[6:9], v[2:3], off ; encoding: [0x00,0x80,0x5c,0xdc,0x02,0x00,0xff,0x06] +0x00,0x80,0x5c,0xdc,0x02,0x00,0xff,0x06 + +# GFX90A: global_store_byte v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x02,0x02,0xff,0x00] +0xff,0x9f,0x60,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_byte v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x60,0xdc,0x02,0xff,0xff,0x00] +0xff,0x9f,0x60,0xdc,0x02,0xff,0xff,0x00 + +# GFX90A: global_store_byte v[2:3], a2, off ; encoding: [0x00,0x80,0x60,0xdc,0x02,0x02,0xff,0x00] +0x00,0x80,0x60,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_byte_d16_hi v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x02,0x02,0xff,0x00] +0xff,0x9f,0x64,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_byte_d16_hi v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x64,0xdc,0x02,0xff,0xff,0x00] +0xff,0x9f,0x64,0xdc,0x02,0xff,0xff,0x00 + +# GFX90A: global_store_byte_d16_hi v[2:3], a2, off ; encoding: [0x00,0x80,0x64,0xdc,0x02,0x02,0xff,0x00] +0x00,0x80,0x64,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_short v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x02,0x02,0xff,0x00] +0xff,0x9f,0x68,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_short v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x68,0xdc,0x02,0xff,0xff,0x00] +0xff,0x9f,0x68,0xdc,0x02,0xff,0xff,0x00 + +# GFX90A: global_store_short v[2:3], a2, off ; encoding: 
[0x00,0x80,0x68,0xdc,0x02,0x02,0xff,0x00] +0x00,0x80,0x68,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_short_d16_hi v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x02,0x02,0xff,0x00] +0xff,0x9f,0x6c,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_short_d16_hi v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x6c,0xdc,0x02,0xff,0xff,0x00] +0xff,0x9f,0x6c,0xdc,0x02,0xff,0xff,0x00 + +# GFX90A: global_store_short_d16_hi v[2:3], a2, off ; encoding: [0x00,0x80,0x6c,0xdc,0x02,0x02,0xff,0x00] +0x00,0x80,0x6c,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_dword v[2:3], a2, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x02,0x02,0xff,0x00] +0xff,0x9f,0x70,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_dword v[2:3], a255, off offset:-1 ; encoding: [0xff,0x9f,0x70,0xdc,0x02,0xff,0xff,0x00] +0xff,0x9f,0x70,0xdc,0x02,0xff,0xff,0x00 + +# GFX90A: global_store_dword v[2:3], a2, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x02,0xff,0x00] +0x00,0x80,0x70,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_dwordx2 v[2:3], a[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x02,0x02,0xff,0x00] +0xff,0x9f,0x74,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_dwordx2 v[2:3], a[254:255], off offset:-1 ; encoding: [0xff,0x9f,0x74,0xdc,0x02,0xfe,0xff,0x00] +0xff,0x9f,0x74,0xdc,0x02,0xfe,0xff,0x00 + +# GFX90A: global_store_dwordx2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x02,0xff,0x00] +0x00,0x80,0x74,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_dwordx3 v[2:3], a[2:4], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x02,0x02,0xff,0x00] +0xff,0x9f,0x78,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_dwordx3 v[2:3], a[252:254], off offset:-1 ; encoding: [0xff,0x9f,0x78,0xdc,0x02,0xfc,0xff,0x00] +0xff,0x9f,0x78,0xdc,0x02,0xfc,0xff,0x00 + +# GFX90A: global_store_dwordx3 v[2:3], a[2:4], off ; encoding: [0x00,0x80,0x78,0xdc,0x02,0x02,0xff,0x00] +0x00,0x80,0x78,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_dwordx4 v[2:3], a[2:5], 
off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x02,0x02,0xff,0x00] +0xff,0x9f,0x7c,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_store_dwordx4 v[2:3], a[252:255], off offset:-1 ; encoding: [0xff,0x9f,0x7c,0xdc,0x02,0xfc,0xff,0x00] +0xff,0x9f,0x7c,0xdc,0x02,0xfc,0xff,0x00 + +# GFX90A: global_store_dwordx4 v[2:3], a[2:5], off ; encoding: [0x00,0x80,0x7c,0xdc,0x02,0x02,0xff,0x00] +0x00,0x80,0x7c,0xdc,0x02,0x02,0xff,0x00 + +# GFX90A: global_load_ubyte_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_ubyte_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x80,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_ubyte_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x80,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x80,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_ubyte_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_ubyte_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x84,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_ubyte_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x84,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x84,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_sbyte_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_sbyte_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x88,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_sbyte_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x88,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x88,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_sbyte_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: 
global_load_sbyte_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x8c,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_sbyte_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x8c,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x8c,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_short_d16 a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_short_d16 a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x90,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_short_d16 a5, v[2:3], off ; encoding: [0x00,0x80,0x90,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x90,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_short_d16_hi a5, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0x05] +0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_load_short_d16_hi a255, v[2:3], off offset:-1 ; encoding: [0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0xff] +0xff,0x9f,0x94,0xdc,0x02,0x00,0xff,0xff + +# GFX90A: global_load_short_d16_hi a5, v[2:3], off ; encoding: [0x00,0x80,0x94,0xdc,0x02,0x00,0xff,0x05] +0x00,0x80,0x94,0xdc,0x02,0x00,0xff,0x05 + +# GFX90A: global_atomic_swap a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x01,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x01,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_cmpswap a1, v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x05,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x05,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_add a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x09,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x09,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_sub a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x0d,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x0d,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_smin a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x11,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x11,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: 
global_atomic_umin a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x15,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x15,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_smax a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x19,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x19,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_umax a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x1d,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x1d,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_and a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x21,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x21,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_or a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x25,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x25,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_xor a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x29,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x29,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_inc a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x2d,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x2d,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_dec a1, v[2:3], a2, off glc ; encoding: [0x00,0x80,0x31,0xdd,0x02,0x02,0xff,0x01] +0x00,0x80,0x31,0xdd,0x02,0x02,0xff,0x01 + +# GFX90A: global_atomic_swap_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x81,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0x81,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_cmpswap_x2 a[2:3], v[2:3], a[2:5], off glc ; encoding: [0x00,0x80,0x85,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0x85,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_add_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x89,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0x89,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_sub_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x8d,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0x8d,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_smin_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x91,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0x91,0xdd,0x02,0x02,0xff,0x02 
+ +# GFX90A: global_atomic_umin_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x95,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0x95,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_smax_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x99,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0x99,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_umax_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0x9d,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0x9d,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_and_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa1,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0xa1,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_or_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa5,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0xa5,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_xor_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xa9,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0xa9,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_inc_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xad,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0xad,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_dec_x2 a[2:3], v[2:3], a[2:3], off glc ; encoding: [0x00,0x80,0xb1,0xdd,0x02,0x02,0xff,0x02] +0x00,0x80,0xb1,0xdd,0x02,0x02,0xff,0x02 + +# GFX90A: global_atomic_swap v[2:3], a2, off ; encoding: [0x00,0x80,0x00,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x00,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_cmpswap v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x04,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x04,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_add v[2:3], a2, off ; encoding: [0x00,0x80,0x08,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x08,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_sub v[2:3], a2, off ; encoding: [0x00,0x80,0x0c,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x0c,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_smin v[2:3], a2, off ; encoding: [0x00,0x80,0x10,0xdd,0x02,0x02,0xff,0x00] 
+0x00,0x80,0x10,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_umin v[2:3], a2, off ; encoding: [0x00,0x80,0x14,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x14,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_smax v[2:3], a2, off ; encoding: [0x00,0x80,0x18,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x18,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_umax v[2:3], a2, off ; encoding: [0x00,0x80,0x1c,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x1c,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_and v[2:3], a2, off ; encoding: [0x00,0x80,0x20,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x20,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_or v[2:3], a2, off ; encoding: [0x00,0x80,0x24,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x24,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_xor v[2:3], a2, off ; encoding: [0x00,0x80,0x28,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x28,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_inc v[2:3], a2, off ; encoding: [0x00,0x80,0x2c,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x2c,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_dec v[2:3], a2, off ; encoding: [0x00,0x80,0x30,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x30,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_swap_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x80,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x80,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_cmpswap_x2 v[2:3], a[2:5], off ; encoding: [0x00,0x80,0x84,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x84,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_add_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x88,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x88,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_sub_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x8c,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x8c,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_smin_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x90,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x90,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_umin_x2 v[2:3], a[2:3], off ; encoding: 
[0x00,0x80,0x94,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x94,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_smax_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x98,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x98,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_umax_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0x9c,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0x9c,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_and_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa0,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0xa0,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_or_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa4,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0xa4,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_xor_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xa8,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0xa8,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_inc_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xac,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0xac,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: global_atomic_dec_x2 v[2:3], a[2:3], off ; encoding: [0x00,0x80,0xb0,0xdd,0x02,0x02,0xff,0x00] +0x00,0x80,0xb0,0xdd,0x02,0x02,0xff,0x00 + +# GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x40,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_ubyte a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x40,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_ubyte a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x40,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_ubyte a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x40,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_ubyte a5, off, vcc_lo offset:-1 ; encoding: 
[0xff,0x5f,0x40,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x40,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_ubyte a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x40,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_ubyte a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x40,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x40,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_ubyte a5, off, s2 ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte a5, off, s2 ; encoding: [0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x40,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x40,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x40,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x40,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x40,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x41,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x42,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x44,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_sbyte a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x44,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_sbyte a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x44,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_sbyte a5, off, flat_scratch_hi offset:-1 ; encoding: 
[0xff,0x5f,0x44,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x44,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_sbyte a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x44,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_sbyte a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x44,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_sbyte a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x44,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x44,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_sbyte a5, off, s2 ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte a5, off, s2 ; encoding: [0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x44,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x44,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x44,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x44,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x44,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x45,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x46,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ushort a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ushort a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x48,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_ushort a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x48,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_ushort a5, off, flat_scratch_lo offset:-1 ; encoding: 
[0xff,0x5f,0x48,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x48,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_ushort a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x48,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_ushort a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x48,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_ushort a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x48,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_ushort a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x48,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x48,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_ushort a5, off, s2 ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ushort a5, off, s2 ; encoding: [0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x48,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ushort a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x48,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x48,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ushort a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x48,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x48,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ushort a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x49,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ushort a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x4a,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sshort a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sshort a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x4c,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_sshort a5, off, s101 offset:-1 ; encoding: 
[0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_sshort a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_sshort a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x4c,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_sshort a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x4c,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_sshort a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x4c,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_sshort a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x4c,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x4c,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_sshort a5, off, s2 ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sshort a5, off, s2 ; encoding: [0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x4c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sshort a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x4c,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x4c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sshort a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x4c,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x4c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sshort a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x4d,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sshort a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x4e,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_dword a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_dword a255, off, s2 offset:-1 ; encoding: 
[0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x50,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_dword a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x50,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_dword a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x50,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_dword a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x50,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_dword a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x50,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_dword a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x50,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_dword a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x50,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x50,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_dword a5, off, s2 ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_dword a5, off, s2 ; encoding: [0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x50,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_dword a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x50,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x50,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_dword a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x50,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x50,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_dword a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x51,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_dword a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x52,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 ; encoding: 
[0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x06] +0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx2 a[254:255], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0xfe] +0xff,0x5f,0x54,0xdc,0x00,0x00,0x82,0xfe + +# GFX90A: scratch_load_dwordx2 a[6:7], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe5,0x06] +0xff,0x5f,0x54,0xdc,0x00,0x00,0xe5,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe6,0x06] +0xff,0x5f,0x54,0xdc,0x00,0x00,0xe6,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xe7,0x06] +0xff,0x5f,0x54,0xdc,0x00,0x00,0xe7,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xea,0x06] +0xff,0x5f,0x54,0xdc,0x00,0x00,0xea,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xeb,0x06] +0xff,0x5f,0x54,0xdc,0x00,0x00,0xeb,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], v0, off offset:-1 ; encoding: [0xff,0x5f,0x54,0xdc,0x00,0x00,0xff,0x06] +0xff,0x5f,0x54,0xdc,0x00,0x00,0xff,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06] +0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 ; encoding: [0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06] +0x00,0x40,0x54,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x54,0xdc,0x00,0x00,0x82,0x06] +0xff,0x4f,0x54,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x54,0xdc,0x00,0x00,0x82,0x06] +0x00,0x50,0x54,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx2 a[6:7], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x06] +0xff,0x5f,0x55,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: 
scratch_load_dwordx2 a[6:7], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x06] +0xff,0x5f,0x56,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x06] +0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx3 a[252:254], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0xfc] +0xff,0x5f,0x58,0xdc,0x00,0x00,0x82,0xfc + +# GFX90A: scratch_load_dwordx3 a[6:8], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe5,0x06] +0xff,0x5f,0x58,0xdc,0x00,0x00,0xe5,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe6,0x06] +0xff,0x5f,0x58,0xdc,0x00,0x00,0xe6,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xe7,0x06] +0xff,0x5f,0x58,0xdc,0x00,0x00,0xe7,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xea,0x06] +0xff,0x5f,0x58,0xdc,0x00,0x00,0xea,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xeb,0x06] +0xff,0x5f,0x58,0xdc,0x00,0x00,0xeb,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], v0, off offset:-1 ; encoding: [0xff,0x5f,0x58,0xdc,0x00,0x00,0xff,0x06] +0xff,0x5f,0x58,0xdc,0x00,0x00,0xff,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06] +0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 ; encoding: [0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06] +0x00,0x40,0x58,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:4095 ; encoding: [0xff,0x4f,0x58,0xdc,0x00,0x00,0x82,0x06] +0xff,0x4f,0x58,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x58,0xdc,0x00,0x00,0x82,0x06] 
+0x00,0x50,0x58,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x06] +0xff,0x5f,0x59,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx3 a[6:8], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x06] +0xff,0x5f,0x5a,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x06] +0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx4 a[252:255], off, s2 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0xfc] +0xff,0x5f,0x5c,0xdc,0x00,0x00,0x82,0xfc + +# GFX90A: scratch_load_dwordx4 a[6:9], off, s101 offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe5,0x06] +0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe5,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe6,0x06] +0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe6,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe7,0x06] +0xff,0x5f,0x5c,0xdc,0x00,0x00,0xe7,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xea,0x06] +0xff,0x5f,0x5c,0xdc,0x00,0x00,0xea,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xeb,0x06] +0xff,0x5f,0x5c,0xdc,0x00,0x00,0xeb,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], v0, off offset:-1 ; encoding: [0xff,0x5f,0x5c,0xdc,0x00,0x00,0xff,0x06] +0xff,0x5f,0x5c,0xdc,0x00,0x00,0xff,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06] +0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 ; encoding: [0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06] +0x00,0x40,0x5c,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:4095 ; 
encoding: [0xff,0x4f,0x5c,0xdc,0x00,0x00,0x82,0x06] +0xff,0x4f,0x5c,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-4096 ; encoding: [0x00,0x50,0x5c,0xdc,0x00,0x00,0x82,0x06] +0x00,0x50,0x5c,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x06] +0xff,0x5f,0x5d,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_load_dwordx4 a[6:9], off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x06] +0xff,0x5f,0x5e,0xdc,0x00,0x00,0x82,0x06 + +# GFX90A: scratch_store_byte off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x60,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0xff,0x83,0x00] +0xff,0x5f,0x60,0xdc,0x00,0xff,0x83,0x00 + +# GFX90A: scratch_store_byte off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe5,0x00] +0xff,0x5f,0x60,0xdc,0x00,0x02,0xe5,0x00 + +# GFX90A: scratch_store_byte off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe6,0x00] +0xff,0x5f,0x60,0xdc,0x00,0x02,0xe6,0x00 + +# GFX90A: scratch_store_byte off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xe7,0x00] +0xff,0x5f,0x60,0xdc,0x00,0x02,0xe7,0x00 + +# GFX90A: scratch_store_byte off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xea,0x00] +0xff,0x5f,0x60,0xdc,0x00,0x02,0xea,0x00 + +# GFX90A: scratch_store_byte off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xeb,0x00] +0xff,0x5f,0x60,0xdc,0x00,0x02,0xeb,0x00 + +# GFX90A: scratch_store_byte v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x60,0xdc,0x00,0x02,0xff,0x00] +0xff,0x5f,0x60,0xdc,0x00,0x02,0xff,0x00 + +# GFX90A: scratch_store_byte off, a2, s3 ; encoding: [0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte off, a2, s3 ; encoding: 
[0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x60,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x60,0xdc,0x00,0x02,0x83,0x00] +0xff,0x4f,0x60,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x60,0xdc,0x00,0x02,0x83,0x00] +0x00,0x50,0x60,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x61,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x62,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x64,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0xff,0x83,0x00] +0xff,0x5f,0x64,0xdc,0x00,0xff,0x83,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe5,0x00] +0xff,0x5f,0x64,0xdc,0x00,0x02,0xe5,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe6,0x00] +0xff,0x5f,0x64,0xdc,0x00,0x02,0xe6,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xe7,0x00] +0xff,0x5f,0x64,0xdc,0x00,0x02,0xe7,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xea,0x00] +0xff,0x5f,0x64,0xdc,0x00,0x02,0xea,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xeb,0x00] +0xff,0x5f,0x64,0xdc,0x00,0x02,0xeb,0x00 + +# GFX90A: scratch_store_byte_d16_hi v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x64,0xdc,0x00,0x02,0xff,0x00] +0xff,0x5f,0x64,0xdc,0x00,0x02,0xff,0x00 + +# GFX90A: 
scratch_store_byte_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x64,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x64,0xdc,0x00,0x02,0x83,0x00] +0xff,0x4f,0x64,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x64,0xdc,0x00,0x02,0x83,0x00] +0x00,0x50,0x64,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x65,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_byte_d16_hi off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x66,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x68,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0xff,0x83,0x00] +0xff,0x5f,0x68,0xdc,0x00,0xff,0x83,0x00 + +# GFX90A: scratch_store_short off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe5,0x00] +0xff,0x5f,0x68,0xdc,0x00,0x02,0xe5,0x00 + +# GFX90A: scratch_store_short off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe6,0x00] +0xff,0x5f,0x68,0xdc,0x00,0x02,0xe6,0x00 + +# GFX90A: scratch_store_short off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xe7,0x00] +0xff,0x5f,0x68,0xdc,0x00,0x02,0xe7,0x00 + +# GFX90A: scratch_store_short off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xea,0x00] +0xff,0x5f,0x68,0xdc,0x00,0x02,0xea,0x00 + +# GFX90A: scratch_store_short off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xeb,0x00] 
+0xff,0x5f,0x68,0xdc,0x00,0x02,0xeb,0x00 + +# GFX90A: scratch_store_short v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x68,0xdc,0x00,0x02,0xff,0x00] +0xff,0x5f,0x68,0xdc,0x00,0x02,0xff,0x00 + +# GFX90A: scratch_store_short off, a2, s3 ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short off, a2, s3 ; encoding: [0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x68,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x68,0xdc,0x00,0x02,0x83,0x00] +0xff,0x4f,0x68,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x68,0xdc,0x00,0x02,0x83,0x00] +0x00,0x50,0x68,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x69,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x6a,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x6c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0xff,0x83,0x00] +0xff,0x5f,0x6c,0xdc,0x00,0xff,0x83,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe5,0x00] +0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe5,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe6,0x00] +0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe6,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe7,0x00] +0xff,0x5f,0x6c,0xdc,0x00,0x02,0xe7,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, vcc_lo offset:-1 ; encoding: 
[0xff,0x5f,0x6c,0xdc,0x00,0x02,0xea,0x00] +0xff,0x5f,0x6c,0xdc,0x00,0x02,0xea,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xeb,0x00] +0xff,0x5f,0x6c,0xdc,0x00,0x02,0xeb,0x00 + +# GFX90A: scratch_store_short_d16_hi v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x6c,0xdc,0x00,0x02,0xff,0x00] +0xff,0x5f,0x6c,0xdc,0x00,0x02,0xff,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, s3 ; encoding: [0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x6c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x6c,0xdc,0x00,0x02,0x83,0x00] +0xff,0x4f,0x6c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x6c,0xdc,0x00,0x02,0x83,0x00] +0x00,0x50,0x6c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x6d,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_short_d16_hi off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x6e,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dword off, a2, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x70,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dword off, a255, s3 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0xff,0x83,0x00] +0xff,0x5f,0x70,0xdc,0x00,0xff,0x83,0x00 + +# GFX90A: scratch_store_dword off, a2, s101 offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe5,0x00] +0xff,0x5f,0x70,0xdc,0x00,0x02,0xe5,0x00 + +# GFX90A: scratch_store_dword off, a2, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe6,0x00] +0xff,0x5f,0x70,0xdc,0x00,0x02,0xe6,0x00 + +# GFX90A: scratch_store_dword off, a2, 
flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xe7,0x00] +0xff,0x5f,0x70,0xdc,0x00,0x02,0xe7,0x00 + +# GFX90A: scratch_store_dword off, a2, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xea,0x00] +0xff,0x5f,0x70,0xdc,0x00,0x02,0xea,0x00 + +# GFX90A: scratch_store_dword off, a2, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xeb,0x00] +0xff,0x5f,0x70,0xdc,0x00,0x02,0xeb,0x00 + +# GFX90A: scratch_store_dword v0, a2, off offset:-1 ; encoding: [0xff,0x5f,0x70,0xdc,0x00,0x02,0xff,0x00] +0xff,0x5f,0x70,0xdc,0x00,0x02,0xff,0x00 + +# GFX90A: scratch_store_dword off, a2, s3 ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dword off, a2, s3 ; encoding: [0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x70,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dword off, a2, s3 offset:4095 ; encoding: [0xff,0x4f,0x70,0xdc,0x00,0x02,0x83,0x00] +0xff,0x4f,0x70,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dword off, a2, s3 offset:-4096 ; encoding: [0x00,0x50,0x70,0xdc,0x00,0x02,0x83,0x00] +0x00,0x50,0x70,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dword off, a2, s3 offset:-1 glc ; encoding: [0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x71,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dword off, a2, s3 offset:-1 slc ; encoding: [0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x72,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x74,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[254:255], s3 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0xfe,0x83,0x00] +0xff,0x5f,0x74,0xdc,0x00,0xfe,0x83,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], s101 offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe5,0x00] +0xff,0x5f,0x74,0xdc,0x00,0x02,0xe5,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], 
flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe6,0x00] +0xff,0x5f,0x74,0xdc,0x00,0x02,0xe6,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xe7,0x00] +0xff,0x5f,0x74,0xdc,0x00,0x02,0xe7,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xea,0x00] +0xff,0x5f,0x74,0xdc,0x00,0x02,0xea,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xeb,0x00] +0xff,0x5f,0x74,0xdc,0x00,0x02,0xeb,0x00 + +# GFX90A: scratch_store_dwordx2 v0, a[2:3], off offset:-1 ; encoding: [0xff,0x5f,0x74,0xdc,0x00,0x02,0xff,0x00] +0xff,0x5f,0x74,0xdc,0x00,0x02,0xff,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 ; encoding: [0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x74,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:4095 ; encoding: [0xff,0x4f,0x74,0xdc,0x00,0x02,0x83,0x00] +0xff,0x4f,0x74,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-4096 ; encoding: [0x00,0x50,0x74,0xdc,0x00,0x02,0x83,0x00] +0x00,0x50,0x74,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x75,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx2 off, a[2:3], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x76,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x78,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[252:254], s3 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0xfc,0x83,0x00] 
+0xff,0x5f,0x78,0xdc,0x00,0xfc,0x83,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], s101 offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe5,0x00] +0xff,0x5f,0x78,0xdc,0x00,0x02,0xe5,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe6,0x00] +0xff,0x5f,0x78,0xdc,0x00,0x02,0xe6,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xe7,0x00] +0xff,0x5f,0x78,0xdc,0x00,0x02,0xe7,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xea,0x00] +0xff,0x5f,0x78,0xdc,0x00,0x02,0xea,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xeb,0x00] +0xff,0x5f,0x78,0xdc,0x00,0x02,0xeb,0x00 + +# GFX90A: scratch_store_dwordx3 v0, a[2:4], off offset:-1 ; encoding: [0xff,0x5f,0x78,0xdc,0x00,0x02,0xff,0x00] +0xff,0x5f,0x78,0xdc,0x00,0x02,0xff,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 ; encoding: [0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x78,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:4095 ; encoding: [0xff,0x4f,0x78,0xdc,0x00,0x02,0x83,0x00] +0xff,0x4f,0x78,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-4096 ; encoding: [0x00,0x50,0x78,0xdc,0x00,0x02,0x83,0x00] +0x00,0x50,0x78,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x79,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx3 off, a[2:4], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x7a,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 
offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x7c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[252:255], s3 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0xfc,0x83,0x00] +0xff,0x5f,0x7c,0xdc,0x00,0xfc,0x83,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], s101 offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe5,0x00] +0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe5,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe6,0x00] +0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe6,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe7,0x00] +0xff,0x5f,0x7c,0xdc,0x00,0x02,0xe7,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xea,0x00] +0xff,0x5f,0x7c,0xdc,0x00,0x02,0xea,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xeb,0x00] +0xff,0x5f,0x7c,0xdc,0x00,0x02,0xeb,0x00 + +# GFX90A: scratch_store_dwordx4 v0, a[2:5], off offset:-1 ; encoding: [0xff,0x5f,0x7c,0xdc,0x00,0x02,0xff,0x00] +0xff,0x5f,0x7c,0xdc,0x00,0x02,0xff,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 ; encoding: [0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00] +0x00,0x40,0x7c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:4095 ; encoding: [0xff,0x4f,0x7c,0xdc,0x00,0x02,0x83,0x00] +0xff,0x4f,0x7c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-4096 ; encoding: [0x00,0x50,0x7c,0xdc,0x00,0x02,0x83,0x00] +0x00,0x50,0x7c,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 glc ; encoding: [0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00] 
+0xff,0x5f,0x7d,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_store_dwordx4 off, a[2:5], s3 offset:-1 slc ; encoding: [0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00] +0xff,0x5f,0x7e,0xdc,0x00,0x02,0x83,0x00 + +# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x80,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_ubyte_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x80,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x80,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x80,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x80,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x80,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x80,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x80,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x80,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x80,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x80,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-4096 ; encoding: 
[0x00,0x50,0x80,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x80,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x81,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x82,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x84,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x84,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x84,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x84,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x84,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x84,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x84,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x84,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x84,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: 
scratch_load_ubyte_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x84,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x84,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x84,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x84,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x85,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_ubyte_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x86,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x88,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_sbyte_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x88,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x88,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x88,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x88,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x88,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x88,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x88,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05] 
+0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, s2 ; encoding: [0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x88,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x88,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x88,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x88,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x88,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x89,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x8a,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x8c,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x8c,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x8c,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x8c,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, v0, off offset:-1 
; encoding: [0xff,0x5f,0x8c,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x8c,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x8c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x8c,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x8c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x8c,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x8c,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x8d,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_sbyte_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x8e,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16 a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x90,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_short_d16 a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x90,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x90,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xe7,0x05] +0xff,0x5f,0x90,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x90,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: 
scratch_load_short_d16 a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x90,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_short_d16 a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x90,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x90,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, s2 ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, s2 ; encoding: [0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x90,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x90,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x90,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x90,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x90,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x91,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16 a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x92,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16_hi a255, off, s2 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0xff] +0xff,0x5f,0x94,0xdc,0x00,0x00,0x82,0xff + +# GFX90A: scratch_load_short_d16_hi a5, off, s101 offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe5,0x05] +0xff,0x5f,0x94,0xdc,0x00,0x00,0xe5,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, flat_scratch_lo offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe6,0x05] +0xff,0x5f,0x94,0xdc,0x00,0x00,0xe6,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, flat_scratch_hi offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xe7,0x05] 
+0xff,0x5f,0x94,0xdc,0x00,0x00,0xe7,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, vcc_lo offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xea,0x05] +0xff,0x5f,0x94,0xdc,0x00,0x00,0xea,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, vcc_hi offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xeb,0x05] +0xff,0x5f,0x94,0xdc,0x00,0x00,0xeb,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, v0, off offset:-1 ; encoding: [0xff,0x5f,0x94,0xdc,0x00,0x00,0xff,0x05] +0xff,0x5f,0x94,0xdc,0x00,0x00,0xff,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, s2 ; encoding: [0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05] +0x00,0x40,0x94,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:4095 ; encoding: [0xff,0x4f,0x94,0xdc,0x00,0x00,0x82,0x05] +0xff,0x4f,0x94,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-4096 ; encoding: [0x00,0x50,0x94,0xdc,0x00,0x00,0x82,0x05] +0x00,0x50,0x94,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 glc ; encoding: [0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x95,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: scratch_load_short_d16_hi a5, off, s2 offset:-1 slc ; encoding: [0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05] +0xff,0x5f,0x96,0xdc,0x00,0x00,0x82,0x05 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_x a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x00,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_format_x a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x00,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_format_x a5, off, s[96:99], s3 
offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x00,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_format_x a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x00,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_format_x a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x00,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x00,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_x a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x00,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x00,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x00,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x00,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x00,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 glc ; encoding: 
[0xff,0x4f,0x00,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x00,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x02,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_x a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x01,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x01,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xy a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x04,0xe0,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x83,0x03] +0xff,0x0f,0x04,0xe0,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x98,0x03] +0xff,0x0f,0x04,0xe0,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x65] +0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x80] +0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf0] +0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf0 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], -4.0 offset:4095 
; encoding: [0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf7] +0xff,0x0f,0x04,0xe0,0x00,0x06,0x82,0xf7 + +# GFX90A: buffer_load_format_xy a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x04,0xe0,0x00,0x06,0x82,0x03] +0xff,0x2f,0x04,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x04,0xe0,0x00,0x06,0x82,0x03] +0xff,0x1f,0x04,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x04,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x04,0xe0,0x00,0x06,0x82,0x03] +0x07,0x00,0x04,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x04,0xe0,0x00,0x06,0x82,0x03] +0xff,0x4f,0x04,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xy a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x06,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyz a[252:254], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0xfc,0x82,0x03] +0xff,0x0f,0x08,0xe0,0x00,0xfc,0x82,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x83,0x03] +0xff,0x0f,0x08,0xe0,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x98,0x03] +0xff,0x0f,0x08,0xe0,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s101 
offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x65] +0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x80] +0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf0] +0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf0 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf7] +0xff,0x0f,0x08,0xe0,0x00,0x06,0x82,0xf7 + +# GFX90A: buffer_load_format_xyz a[6:8], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x08,0xe0,0x00,0x06,0x82,0x03] +0xff,0x2f,0x08,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x08,0xe0,0x00,0x06,0x82,0x03] +0xff,0x1f,0x08,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x08,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x08,0xe0,0x00,0x06,0x82,0x03] +0x07,0x00,0x08,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x08,0xe0,0x00,0x06,0x82,0x03] +0xff,0x4f,0x08,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyz a[6:8], off, 
s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x0a,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyzw a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x82,0x03] +0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x82,0x03 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x83,0x03] +0xff,0x0f,0x0c,0xe0,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x98,0x03] +0xff,0x0f,0x0c,0xe0,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x65] +0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x80] +0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf0] +0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf0 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf7] +0xff,0x0f,0x0c,0xe0,0x00,0x06,0x82,0xf7 + +# GFX90A: buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x0c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x2f,0x0c,0xe0,0x00,0x06,0x82,0x03 + 
+# GFX90A: buffer_load_format_xyzw a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x0c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x1f,0x0c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03] +0x07,0x00,0x0c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x0c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x4f,0x0c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_xyzw a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0e,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x0e,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_x a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0xff,0x83,0x04] +0xff,0x0f,0x10,0xe0,0x00,0xff,0x83,0x04 + +# GFX90A: buffer_store_format_x a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x84,0x04] +0xff,0x0f,0x10,0xe0,0x00,0x01,0x84,0x04 + +# GFX90A: buffer_store_format_x a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x98,0x04] +0xff,0x0f,0x10,0xe0,0x00,0x01,0x98,0x04 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x65] +0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x65 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x7c] +0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x7c + +# GFX90A: 
buffer_store_format_x a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x80] +0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0x80 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xc1] +0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xc1 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf0] +0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf0 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf7] +0xff,0x0f,0x10,0xe0,0x00,0x01,0x83,0xf7 + +# GFX90A: buffer_store_format_x a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x10,0xe0,0x00,0x01,0x83,0x04] +0xff,0x2f,0x10,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_x a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x10,0xe0,0x00,0x01,0x83,0x04] +0xff,0x1f,0x10,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x10,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x10,0xe0,0x00,0x01,0x83,0x04] +0x07,0x00,0x10,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x10,0xe0,0x00,0x01,0x83,0x04] +0xff,0x4f,0x10,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_x a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x12,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x12,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xy 
a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0xfe,0x83,0x04] +0xff,0x0f,0x14,0xe0,0x00,0xfe,0x83,0x04 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x84,0x04] +0xff,0x0f,0x14,0xe0,0x00,0x02,0x84,0x04 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x98,0x04] +0xff,0x0f,0x14,0xe0,0x00,0x02,0x98,0x04 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x65] +0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x65 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x7c] +0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x7c + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x80] +0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0x80 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xc1] +0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xc1 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf0] +0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf0 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf7] +0xff,0x0f,0x14,0xe0,0x00,0x02,0x83,0xf7 + +# GFX90A: buffer_store_format_xy a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x14,0xe0,0x00,0x02,0x83,0x04] +0xff,0x2f,0x14,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xy a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x14,0xe0,0x00,0x02,0x83,0x04] +0xff,0x1f,0x14,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04 + 
+# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x14,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x14,0xe0,0x00,0x02,0x83,0x04] +0x07,0x00,0x14,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x14,0xe0,0x00,0x02,0x83,0x04] +0xff,0x4f,0x14,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xy a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x16,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x16,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyz a[252:254], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0xfc,0x83,0x04] +0xff,0x0f,0x18,0xe0,0x00,0xfc,0x83,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x84,0x04] +0xff,0x0f,0x18,0xe0,0x00,0x02,0x84,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x98,0x04] +0xff,0x0f,0x18,0xe0,0x00,0x02,0x98,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x65] +0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x65 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x7c] +0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x7c + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x80] +0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0x80 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xc1] 
+0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xc1 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf0] +0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf0 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf7] +0xff,0x0f,0x18,0xe0,0x00,0x02,0x83,0xf7 + +# GFX90A: buffer_store_format_xyz a[2:4], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x18,0xe0,0x00,0x02,0x83,0x04] +0xff,0x2f,0x18,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x18,0xe0,0x00,0x02,0x83,0x04] +0xff,0x1f,0x18,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x18,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x18,0xe0,0x00,0x02,0x83,0x04] +0x07,0x00,0x18,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x18,0xe0,0x00,0x02,0x83,0x04] +0xff,0x4f,0x18,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyz a[2:4], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x1a,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x1a,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyzw a[252:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x83,0x04] +0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x83,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[16:19], s4 offset:4095 ; encoding: 
[0xff,0x0f,0x1c,0xe0,0x00,0x02,0x84,0x04] +0xff,0x0f,0x1c,0xe0,0x00,0x02,0x84,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x98,0x04] +0xff,0x0f,0x1c,0xe0,0x00,0x02,0x98,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x65] +0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x65 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x7c] +0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x7c + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x80] +0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0x80 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xc1] +0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xc1 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf0] +0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf0 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf7] +0xff,0x0f,0x1c,0xe0,0x00,0x02,0x83,0xf7 + +# GFX90A: buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x1c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x2f,0x1c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x1c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x1f,0x1c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], 
off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04] +0x07,0x00,0x1c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x1c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x4f,0x1c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_xyzw a[2:5], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x1e,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x1e,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_x a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x20,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_format_d16_x a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x20,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_format_d16_x a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x20,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: 
buffer_load_format_d16_x a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_format_d16_x a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x20,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x20,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_x a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x20,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x20,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x20,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x20,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x20,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x20,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x20,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_x a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x22,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x22,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xy a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x24,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x24,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x24,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: 
buffer_load_format_d16_xy a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_format_d16_xy a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x24,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x24,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xy a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x24,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x24,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x24,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x24,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x24,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x24,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x24,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: 
buffer_load_format_d16_xy a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x26,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x26,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x28,0xe0,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x83,0x03] +0xff,0x0f,0x28,0xe0,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x98,0x03] +0xff,0x0f,0x28,0xe0,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x65] +0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x80] +0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf0] +0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf0 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf7] +0xff,0x0f,0x28,0xe0,0x00,0x06,0x82,0xf7 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: 
[0xff,0x2f,0x28,0xe0,0x00,0x06,0x82,0x03] +0xff,0x2f,0x28,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x28,0xe0,0x00,0x06,0x82,0x03] +0xff,0x1f,0x28,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x28,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x28,0xe0,0x00,0x06,0x82,0x03] +0x07,0x00,0x28,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x28,0xe0,0x00,0x06,0x82,0x03] +0xff,0x4f,0x28,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyz a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2a,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x2a,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x2c,0xe0,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x83,0x03] +0xff,0x0f,0x2c,0xe0,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x98,0x03] +0xff,0x0f,0x2c,0xe0,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x65] +0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x65 + +# GFX90A: 
buffer_load_format_d16_xyzw a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x80] +0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf0] +0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf0 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf7] +0xff,0x0f,0x2c,0xe0,0x00,0x06,0x82,0xf7 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x2c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x2f,0x2c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x2c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x1f,0x2c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03] +0x07,0x00,0x2c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x2c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x4f,0x2c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_format_d16_xyzw a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: 
[0xff,0x0f,0x2e,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x2e,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_x a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0xff,0x83,0x04] +0xff,0x0f,0x30,0xe0,0x00,0xff,0x83,0x04 + +# GFX90A: buffer_store_format_d16_x a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x84,0x04] +0xff,0x0f,0x30,0xe0,0x00,0x01,0x84,0x04 + +# GFX90A: buffer_store_format_d16_x a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x98,0x04] +0xff,0x0f,0x30,0xe0,0x00,0x01,0x98,0x04 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x65] +0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x65 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x7c] +0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x7c + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x80] +0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x80 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xc1] +0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xc1 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf0] +0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf0 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf7] +0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0xf7 + +# GFX90A: buffer_store_format_d16_x a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x30,0xe0,0x00,0x01,0x83,0x04] +0xff,0x2f,0x30,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_x a1, v0, s[12:15], 
s4 offen offset:4095 ; encoding: [0xff,0x1f,0x30,0xe0,0x00,0x01,0x83,0x04] +0xff,0x1f,0x30,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x30,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x30,0xe0,0x00,0x01,0x83,0x04] +0x07,0x00,0x30,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x30,0xe0,0x00,0x01,0x83,0x04] +0xff,0x4f,0x30,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_x a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x32,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x32,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xy a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0xff,0x83,0x04] +0xff,0x0f,0x34,0xe0,0x00,0xff,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x84,0x04] +0xff,0x0f,0x34,0xe0,0x00,0x01,0x84,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x98,0x04] +0xff,0x0f,0x34,0xe0,0x00,0x01,0x98,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x65] +0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x65 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x7c] +0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x7c + +# GFX90A: buffer_store_format_d16_xy a1, 
off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x80] +0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x80 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xc1] +0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xc1 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf0] +0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf0 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf7] +0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0xf7 + +# GFX90A: buffer_store_format_d16_xy a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x34,0xe0,0x00,0x01,0x83,0x04] +0xff,0x2f,0x34,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x34,0xe0,0x00,0x01,0x83,0x04] +0xff,0x1f,0x34,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x34,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x34,0xe0,0x00,0x01,0x83,0x04] +0x07,0x00,0x34,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x34,0xe0,0x00,0x01,0x83,0x04] +0xff,0x4f,0x34,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xy a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x36,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x36,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: 
buffer_store_format_d16_xyz a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0xfe,0x83,0x04] +0xff,0x0f,0x38,0xe0,0x00,0xfe,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x84,0x04] +0xff,0x0f,0x38,0xe0,0x00,0x02,0x84,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x98,0x04] +0xff,0x0f,0x38,0xe0,0x00,0x02,0x98,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x65] +0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x65 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x7c] +0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x7c + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x80] +0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0x80 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xc1] +0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xc1 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf0] +0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf0 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf7] +0xff,0x0f,0x38,0xe0,0x00,0x02,0x83,0xf7 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x38,0xe0,0x00,0x02,0x83,0x04] +0xff,0x2f,0x38,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x38,0xe0,0x00,0x02,0x83,0x04] +0xff,0x1f,0x38,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 ; encoding: 
[0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x38,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x38,0xe0,0x00,0x02,0x83,0x04] +0x07,0x00,0x38,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x38,0xe0,0x00,0x02,0x83,0x04] +0xff,0x4f,0x38,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyz a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x3a,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x3a,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0xfe,0x83,0x04] +0xff,0x0f,0x3c,0xe0,0x00,0xfe,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x84,0x04] +0xff,0x0f,0x3c,0xe0,0x00,0x02,0x84,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x98,0x04] +0xff,0x0f,0x3c,0xe0,0x00,0x02,0x98,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x65] +0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x65 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x7c] +0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x7c + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x80] 
+0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0x80 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xc1] +0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xc1 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf0] +0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf0 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf7] +0xff,0x0f,0x3c,0xe0,0x00,0x02,0x83,0xf7 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x3c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x2f,0x3c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x3c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x1f,0x3c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04] +0x07,0x00,0x3c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x3c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x4f,0x3c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_format_d16_xyzw a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x3e,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x3e,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte a255, off, s[8:11], s3 
offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x40,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_ubyte a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x40,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_ubyte a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x40,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_ubyte a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x40,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x40,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x40,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x40,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03] 
+0x00,0x00,0x40,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x40,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x40,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x40,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x40,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x42,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x42,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x41,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x41,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x44,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x44,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x44,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_sbyte a5, 
off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_sbyte a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x44,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x44,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x44,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x44,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x44,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x44,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x44,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x44,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x44,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x46,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x46,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x45,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x45,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ushort a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x48,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x83,0x03] 
+0xff,0x0f,0x48,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x48,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_ushort a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_ushort a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x48,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x48,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ushort a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x48,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x48,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x48,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x48,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x48,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], 
s3 offset:4095 glc ; encoding: [0xff,0x4f,0x48,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x48,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x4a,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x4a,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ushort a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x49,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x49,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sshort a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x4c,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x4c,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x4c,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_sshort a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], -4.0 offset:4095 ; encoding: 
[0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_sshort a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x4c,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x4c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sshort a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x4c,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x4c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x4c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x4c,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x4c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x4e,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x4e,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sshort a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x4d,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x4d,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dword a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x50,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_dword a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x50,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_dword a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x50,0xe0,0x00,0x05,0x98,0x03 + 
+# GFX90A: buffer_load_dword a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_dword a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_dword a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_dword a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_dword a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_dword a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_dword a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x50,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x50,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dword a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x50,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x50,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dword a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dword a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x50,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x50,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x50,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x50,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x50,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 slc ; encoding: 
[0xff,0x0f,0x52,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x52,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dword a5, off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x51,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x51,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x54,0xe0,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x83,0x03] +0xff,0x0f,0x54,0xe0,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x98,0x03] +0xff,0x0f,0x54,0xe0,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x65] +0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x80] +0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf0] +0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf0 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf7] +0xff,0x0f,0x54,0xe0,0x00,0x06,0x82,0xf7 + +# GFX90A: buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: 
[0xff,0x2f,0x54,0xe0,0x00,0x06,0x82,0x03] +0xff,0x2f,0x54,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x54,0xe0,0x00,0x06,0x82,0x03] +0xff,0x1f,0x54,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x54,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x54,0xe0,0x00,0x06,0x82,0x03] +0x07,0x00,0x54,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x54,0xe0,0x00,0x06,0x82,0x03] +0xff,0x4f,0x54,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x56,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x56,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx3 a[252:254], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0xfc,0x82,0x03] +0xff,0x0f,0x58,0xe0,0x00,0xfc,0x82,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x83,0x03] +0xff,0x0f,0x58,0xe0,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x98,0x03] +0xff,0x0f,0x58,0xe0,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x65] +0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], m0 offset:4095 ; encoding: 
[0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x80] +0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf0] +0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf0 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf7] +0xff,0x0f,0x58,0xe0,0x00,0x06,0x82,0xf7 + +# GFX90A: buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x58,0xe0,0x00,0x06,0x82,0x03] +0xff,0x2f,0x58,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x58,0xe0,0x00,0x06,0x82,0x03] +0xff,0x1f,0x58,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 ; encoding: [0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x58,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x58,0xe0,0x00,0x06,0x82,0x03] +0x07,0x00,0x58,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x58,0xe0,0x00,0x06,0x82,0x03] +0xff,0x4f,0x58,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx3 a[6:8], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x5a,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x5a,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx4 a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x82,0x03] +0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x82,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x83,0x03] +0xff,0x0f,0x5c,0xe0,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x98,0x03] +0xff,0x0f,0x5c,0xe0,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x65] +0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x80] +0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf0] +0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf0 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf7] +0xff,0x0f,0x5c,0xe0,0x00,0x06,0x82,0xf7 + +# GFX90A: buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x5c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x2f,0x5c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x5c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x1f,0x5c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 ; encoding: 
[0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03] +0x00,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03] +0x07,0x00,0x5c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x5c,0xe0,0x00,0x06,0x82,0x03] +0xff,0x4f,0x5c,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_load_dwordx4 a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x5e,0xe0,0x00,0x06,0x82,0x03] +0xff,0x0f,0x5e,0xe0,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x83,0x04] +0xff,0x0f,0x60,0xe0,0x00,0xff,0x83,0x04 + +# GFX90A: buffer_store_byte a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x84,0x04] +0xff,0x0f,0x60,0xe0,0x00,0x01,0x84,0x04 + +# GFX90A: buffer_store_byte a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x98,0x04] +0xff,0x0f,0x60,0xe0,0x00,0x01,0x98,0x04 + +# GFX90A: buffer_store_byte a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x65] +0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x65 + +# GFX90A: buffer_store_byte a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x7c] +0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x7c + +# GFX90A: buffer_store_byte a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x80] +0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x80 + +# GFX90A: buffer_store_byte a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xc1] 
+0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xc1 + +# GFX90A: buffer_store_byte a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf0] +0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf0 + +# GFX90A: buffer_store_byte a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf7] +0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0xf7 + +# GFX90A: buffer_store_byte a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x60,0xe0,0x00,0x01,0x83,0x04] +0xff,0x2f,0x60,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x60,0xe0,0x00,0x01,0x83,0x04] +0xff,0x1f,0x60,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x60,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x60,0xe0,0x00,0x01,0x83,0x04] +0x07,0x00,0x60,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x60,0xe0,0x00,0x01,0x83,0x04] +0xff,0x4f,0x60,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x62,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x62,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte_d16_hi a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0xff,0x83,0x04] +0xff,0x0f,0x64,0xe0,0x00,0xff,0x83,0x04 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x84,0x04] +0xff,0x0f,0x64,0xe0,0x00,0x01,0x84,0x04 + +# GFX90A: 
buffer_store_byte_d16_hi a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x98,0x04] +0xff,0x0f,0x64,0xe0,0x00,0x01,0x98,0x04 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x65] +0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x65 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x7c] +0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x7c + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x80] +0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x80 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xc1] +0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xc1 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf0] +0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf0 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf7] +0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0xf7 + +# GFX90A: buffer_store_byte_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x64,0xe0,0x00,0x01,0x83,0x04] +0xff,0x2f,0x64,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte_d16_hi a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x64,0xe0,0x00,0x01,0x83,0x04] +0xff,0x1f,0x64,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x64,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x64,0xe0,0x00,0x01,0x83,0x04] +0x07,0x00,0x64,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: 
buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x64,0xe0,0x00,0x01,0x83,0x04] +0xff,0x4f,0x64,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_byte_d16_hi a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x66,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x66,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x83,0x04] +0xff,0x0f,0x68,0xe0,0x00,0xff,0x83,0x04 + +# GFX90A: buffer_store_short a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x84,0x04] +0xff,0x0f,0x68,0xe0,0x00,0x01,0x84,0x04 + +# GFX90A: buffer_store_short a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x98,0x04] +0xff,0x0f,0x68,0xe0,0x00,0x01,0x98,0x04 + +# GFX90A: buffer_store_short a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x65] +0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x65 + +# GFX90A: buffer_store_short a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x7c] +0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x7c + +# GFX90A: buffer_store_short a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x80] +0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x80 + +# GFX90A: buffer_store_short a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xc1] +0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xc1 + +# GFX90A: buffer_store_short a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf0] +0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf0 + +# GFX90A: buffer_store_short a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf7] +0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0xf7 + +# GFX90A: buffer_store_short a1, v0, s[12:15], s4 
idxen offset:4095 ; encoding: [0xff,0x2f,0x68,0xe0,0x00,0x01,0x83,0x04] +0xff,0x2f,0x68,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x68,0xe0,0x00,0x01,0x83,0x04] +0xff,0x1f,0x68,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x68,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x68,0xe0,0x00,0x01,0x83,0x04] +0x07,0x00,0x68,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x68,0xe0,0x00,0x01,0x83,0x04] +0xff,0x4f,0x68,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x6a,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x6a,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short_d16_hi a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0xff,0x83,0x04] +0xff,0x0f,0x6c,0xe0,0x00,0xff,0x83,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x84,0x04] +0xff,0x0f,0x6c,0xe0,0x00,0x01,0x84,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x98,0x04] +0xff,0x0f,0x6c,0xe0,0x00,0x01,0x98,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x65] +0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x65 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], m0 offset:4095 ; encoding: 
[0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x7c] +0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x7c + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x80] +0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x80 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xc1] +0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xc1 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf0] +0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf0 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf7] +0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0xf7 + +# GFX90A: buffer_store_short_d16_hi a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x6c,0xe0,0x00,0x01,0x83,0x04] +0xff,0x2f,0x6c,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x6c,0xe0,0x00,0x01,0x83,0x04] +0xff,0x1f,0x6c,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04] +0x07,0x00,0x6c,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x6c,0xe0,0x00,0x01,0x83,0x04] +0xff,0x4f,0x6c,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_short_d16_hi a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x6e,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x6e,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 ; 
encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_dword a255, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0xff,0x83,0x04] +0xff,0x0f,0x70,0xe0,0x00,0xff,0x83,0x04 + +# GFX90A: buffer_store_dword a1, off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x84,0x04] +0xff,0x0f,0x70,0xe0,0x00,0x01,0x84,0x04 + +# GFX90A: buffer_store_dword a1, off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x98,0x04] +0xff,0x0f,0x70,0xe0,0x00,0x01,0x98,0x04 + +# GFX90A: buffer_store_dword a1, off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x65] +0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x65 + +# GFX90A: buffer_store_dword a1, off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x7c] +0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x7c + +# GFX90A: buffer_store_dword a1, off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x80] +0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x80 + +# GFX90A: buffer_store_dword a1, off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xc1] +0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xc1 + +# GFX90A: buffer_store_dword a1, off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf0] +0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf0 + +# GFX90A: buffer_store_dword a1, off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf7] +0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0xf7 + +# GFX90A: buffer_store_dword a1, v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x70,0xe0,0x00,0x01,0x83,0x04] +0xff,0x2f,0x70,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_dword a1, v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x70,0xe0,0x00,0x01,0x83,0x04] +0xff,0x1f,0x70,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_dword a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04] 
+0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_dword a1, off, s[12:15], s4 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04] +0x00,0x00,0x70,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x70,0xe0,0x00,0x01,0x83,0x04] +0x07,0x00,0x70,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x70,0xe0,0x00,0x01,0x83,0x04] +0xff,0x4f,0x70,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_dword a1, off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x72,0xe0,0x00,0x01,0x83,0x04] +0xff,0x0f,0x72,0xe0,0x00,0x01,0x83,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx2 a[254:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0xfe,0x83,0x04] +0xff,0x0f,0x74,0xe0,0x00,0xfe,0x83,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x84,0x04] +0xff,0x0f,0x74,0xe0,0x00,0x02,0x84,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x98,0x04] +0xff,0x0f,0x74,0xe0,0x00,0x02,0x98,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x65] +0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x65 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x7c] +0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x7c + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x80] +0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0x80 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xc1] 
+0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xc1 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf0] +0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf0 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf7] +0xff,0x0f,0x74,0xe0,0x00,0x02,0x83,0xf7 + +# GFX90A: buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x74,0xe0,0x00,0x02,0x83,0x04] +0xff,0x2f,0x74,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x74,0xe0,0x00,0x02,0x83,0x04] +0xff,0x1f,0x74,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 ; encoding: [0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x74,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x74,0xe0,0x00,0x02,0x83,0x04] +0x07,0x00,0x74,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x74,0xe0,0x00,0x02,0x83,0x04] +0xff,0x4f,0x74,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx2 a[2:3], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x76,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x76,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx3 a[252:254], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0xfc,0x83,0x04] +0xff,0x0f,0x78,0xe0,0x00,0xfc,0x83,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[16:19], s4 offset:4095 ; encoding: 
[0xff,0x0f,0x78,0xe0,0x00,0x02,0x84,0x04] +0xff,0x0f,0x78,0xe0,0x00,0x02,0x84,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x98,0x04] +0xff,0x0f,0x78,0xe0,0x00,0x02,0x98,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x65] +0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x65 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x7c] +0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x7c + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x80] +0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0x80 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xc1] +0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xc1 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf0] +0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf0 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf7] +0xff,0x0f,0x78,0xe0,0x00,0x02,0x83,0xf7 + +# GFX90A: buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x78,0xe0,0x00,0x02,0x83,0x04] +0xff,0x2f,0x78,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x78,0xe0,0x00,0x02,0x83,0x04] +0xff,0x1f,0x78,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 ; encoding: [0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x78,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:7 ; encoding: 
[0x07,0x00,0x78,0xe0,0x00,0x02,0x83,0x04] +0x07,0x00,0x78,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x78,0xe0,0x00,0x02,0x83,0x04] +0xff,0x4f,0x78,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx3 a[2:4], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x7a,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x7a,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx4 a[252:255], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0xfc,0x83,0x04] +0xff,0x0f,0x7c,0xe0,0x00,0xfc,0x83,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[16:19], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x84,0x04] +0xff,0x0f,0x7c,0xe0,0x00,0x02,0x84,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[96:99], s4 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x98,0x04] +0xff,0x0f,0x7c,0xe0,0x00,0x02,0x98,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s101 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x65] +0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x65 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], m0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x7c] +0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x7c + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], 0 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x80] +0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0x80 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], -1 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xc1] +0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xc1 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf0] +0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf0 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], -4.0 
offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf7] +0xff,0x0f,0x7c,0xe0,0x00,0x02,0x83,0xf7 + +# GFX90A: buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 idxen offset:4095 ; encoding: [0xff,0x2f,0x7c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x2f,0x7c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], v0, s[12:15], s4 offen offset:4095 ; encoding: [0xff,0x1f,0x7c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x1f,0x7c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 ; encoding: [0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04] +0x00,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:7 ; encoding: [0x07,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04] +0x07,0x00,0x7c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 glc ; encoding: [0xff,0x4f,0x7c,0xe0,0x00,0x02,0x83,0x04] +0xff,0x4f,0x7c,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_store_dwordx4 a[2:5], off, s[12:15], s4 offset:4095 slc ; encoding: [0xff,0x0f,0x7e,0xe0,0x00,0x02,0x83,0x04] +0xff,0x0f,0x7e,0xe0,0x00,0x02,0x83,0x04 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x80,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x80,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x80,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s101 offset:4095 ; 
encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_ubyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x80,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x80,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x80,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x80,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x80,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x80,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x80,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x80,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x80,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: 
[0xff,0x0f,0x82,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x82,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x84,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x84,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x84,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x84,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x84,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; 
encoding: [0xff,0x1f,0x84,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x84,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x84,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x84,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x84,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x84,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x84,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_ubyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x86,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x88,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x88,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x88,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: 
[0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_sbyte_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x88,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x88,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x88,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x88,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x88,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x88,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x88,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x88,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x88,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x8a,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0xff,0x82,0x03] 
+0xff,0x0f,0x8c,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x8c,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x8c,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x8c,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x8c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x8c,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x8c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03] 
+0x00,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x8c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x8c,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x8c,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_sbyte_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x8e,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16 a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x90,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_short_d16 a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x90,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_short_d16 a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x98,0x03] +0xff,0x0f,0x90,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf0] 
+0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x90,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_short_d16 a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x90,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x90,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16 a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x90,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x90,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x90,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x90,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x90,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x90,0xe0,0x00,0x05,0x82,0x03] +0xff,0x4f,0x90,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16 a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x92,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x92,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16_hi a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0xff,0x82,0x03] +0xff,0x0f,0x94,0xe0,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x83,0x03] +0xff,0x0f,0x94,0xe0,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x98,0x03] 
+0xff,0x0f,0x94,0xe0,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x65] +0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x80] +0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], 0.5 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf0] +0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf0 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], -4.0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf7] +0xff,0x0f,0x94,0xe0,0x00,0x05,0x82,0xf7 + +# GFX90A: buffer_load_short_d16_hi a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x94,0xe0,0x00,0x05,0x82,0x03] +0xff,0x2f,0x94,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x94,0xe0,0x00,0x05,0x82,0x03] +0xff,0x1f,0x94,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03] +0x00,0x00,0x94,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x94,0xe0,0x00,0x05,0x82,0x03] +0x07,0x00,0x94,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x94,0xe0,0x00,0x05,0x82,0x03] 
+0xff,0x4f,0x94,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_load_short_d16_hi a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xe0,0x00,0x05,0x82,0x03] +0xff,0x0f,0x96,0xe0,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_swap a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x00,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_swap a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x00,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_swap a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x00,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_swap a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_swap a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_swap a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_swap a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_swap a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x00,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x00,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_swap a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x00,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x00,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: 
buffer_atomic_swap a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x00,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x00,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x00,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x00,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x00,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_swap a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x02,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x02,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x04,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0x04,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0x04,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x04,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_cmpswap 
a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x04,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0x04,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x04,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0x04,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x04,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x04,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0x04,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x04,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0x04,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x06,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x06,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_add a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x08,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_add a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x08,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_add a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x08,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_add a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_add a5, off, s[8:11], m0 offset:4095 ; 
encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_add a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_add a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_add a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x08,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x08,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_add a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x08,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x08,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x08,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x08,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x08,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x08,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x08,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_add a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0a,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x0a,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_sub a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x0c,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_sub a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x0c,0xe1,0x00,0x05,0x83,0x03 + +# 
GFX90A: buffer_atomic_sub a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x0c,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_sub a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x0c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x0c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_sub a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x0c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x0c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x0c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x0c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x0c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_sub a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x0e,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x0e,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smin a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x10,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_smin a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x10,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_smin a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x10,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_smin a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_smin a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_smin a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_smin a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_smin a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x10,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x10,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smin a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x10,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x10,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x10,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x10,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x10,0xe1,0x00,0x05,0x82,0x03 + +# 
GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x10,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x10,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smin a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x12,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x12,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umin a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x14,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_umin a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x14,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_umin a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x14,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_umin a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x14,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x14,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umin a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x14,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x14,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 ; 
encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x14,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x14,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x14,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x14,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x14,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umin a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x16,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x16,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smax a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x18,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_smax a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x18,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_smax a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x18,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0xc1] 
+0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_smax a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x18,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x18,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smax a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x18,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x18,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x18,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x18,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x18,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x18,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x18,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_smax a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x1a,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x1a,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umax a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x1c,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_umax a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x1c,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_umax a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x1c,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_umax a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_umax a5, off, 
s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_umax a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_umax a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_umax a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x1c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x1c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umax a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x1c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x1c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x1c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x1c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x1c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_umax a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x1e,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x1e,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_and a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x20,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_and a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x83,0x03] 
+0xff,0x0f,0x20,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_and a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x20,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_and a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_and a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_and a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_and a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_and a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x20,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x20,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_and a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x20,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x20,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x20,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x20,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x20,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x20,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x20,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_and a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x22,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x22,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 
offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_or a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x24,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_or a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x24,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_or a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x24,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_or a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_or a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_or a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_or a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_or a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x24,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x24,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_or a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x24,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x24,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x24,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x24,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x24,0xe1,0x00,0x05,0x82,0x03 + +# 
GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x24,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x24,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_or a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x26,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x26,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_xor a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x28,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_xor a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x28,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_xor a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x28,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_xor a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x28,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x28,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_xor a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x28,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x28,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 ; encoding: 
[0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x28,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x28,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x28,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x28,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x28,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_xor a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2a,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x2a,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_inc a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x2c,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_inc a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x2c,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_inc a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x2c,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0xc1 + +# 
GFX90A: buffer_atomic_inc a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x2c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x2c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_inc a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x2c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x2c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x2c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x2c,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x2c,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_inc a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x2e,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x2e,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_dec a255, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0xff,0x82,0x03] +0xff,0x0f,0x30,0xe1,0x00,0xff,0x82,0x03 + +# GFX90A: buffer_atomic_dec a5, off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x83,0x03] +0xff,0x0f,0x30,0xe1,0x00,0x05,0x83,0x03 + +# GFX90A: buffer_atomic_dec a5, off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x98,0x03] +0xff,0x0f,0x30,0xe1,0x00,0x05,0x98,0x03 + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x65] +0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x65 + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], m0 offset:4095 ; encoding: 
[0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x7c] +0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x7c + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x80] +0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x80 + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0xc1] +0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0xc1 + +# GFX90A: buffer_atomic_dec a5, v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x30,0xe1,0x00,0x05,0x82,0x03] +0xff,0x2f,0x30,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_dec a5, v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x30,0xe1,0x00,0x05,0x82,0x03] +0xff,0x1f,0x30,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 ; encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03] +0x00,0x00,0x30,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x30,0xe1,0x00,0x05,0x82,0x03] +0x07,0x00,0x30,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x30,0xe1,0x00,0x05,0x82,0x03] +0xff,0x4f,0x30,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_dec a5, off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x32,0xe1,0x00,0x05,0x82,0x03] +0xff,0x0f,0x32,0xe1,0x00,0x05,0x82,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x80,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x83,0x03] 
+0xff,0x0f,0x80,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0x80,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x80,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x80,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0x80,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x80,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0x80,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x80,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x80,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0x80,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x80,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0x80,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_swap_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x82,0xe1,0x00,0x06,0x82,0x03] 
+0xff,0x0f,0x82,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[252:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0xfc,0x82,0x03] +0xff,0x0f,0x84,0xe1,0x00,0xfc,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0x84,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0x84,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x84,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x84,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0x84,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x84,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0x84,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 ; encoding: [0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 ; encoding: 
[0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x84,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x84,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0x84,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x84,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0x84,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_cmpswap_x2 a[6:9], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x86,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x86,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_add_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x88,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0x88,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0x88,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x88,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 idxen 
offset:4095 ; encoding: [0xff,0x2f,0x88,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0x88,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x88,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0x88,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x88,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x88,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0x88,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x88,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0x88,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_add_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8a,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x8a,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x8c,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0x8c,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0x8c,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], m0 offset:4095 ; 
encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x8c,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x8c,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0x8c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x8c,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0x8c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0x8c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x8c,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0x8c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_sub_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x8e,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x8e,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x90,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: 
[0xff,0x0f,0x90,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0x90,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0x90,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x90,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x90,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0x90,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x90,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0x90,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x90,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x90,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0x90,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x90,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0x90,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: 
[0xff,0x0f,0x92,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x92,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x94,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0x94,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0x94,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x94,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x94,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0x94,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x94,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0x94,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 ; encoding: 
[0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x94,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x94,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0x94,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x94,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0x94,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umin_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x96,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x96,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x98,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0x98,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0x98,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x98,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 idxen 
offset:4095 ; encoding: [0xff,0x2f,0x98,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0x98,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x98,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0x98,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x98,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x98,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0x98,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x98,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0x98,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_smax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x9a,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x9a,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0x9c,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0x9c,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0x9c,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], m0 
offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0x9c,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0x9c,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0x9c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0x9c,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0x9c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0x9c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0x9c,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0x9c,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_umax_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0x9e,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0x9e,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_and_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0xa0,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[12:15], s3 
offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0xa0,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0xa0,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0xa0,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa0,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0xa0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa0,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0xa0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0xa0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa0,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0xa0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_and_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; 
encoding: [0xff,0x0f,0xa2,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xa2,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_or_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0xa4,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0xa4,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0xa4,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0xa4,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xa4,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0xa4,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa4,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0xa4,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 ; encoding: 
[0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0xa4,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa4,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0xa4,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_or_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xa6,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xa6,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0xa8,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0xa8,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0xa8,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0xa8,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; 
encoding: [0xff,0x2f,0xa8,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0xa8,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xa8,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0xa8,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0xa8,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xa8,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0xa8,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_xor_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xaa,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xaa,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0xac,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0xac,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0xac,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: 
[0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0xac,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xac,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0xac,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xac,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0xac,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xac,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xac,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0xac,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xac,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0xac,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_inc_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: [0xff,0x0f,0xae,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xae,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[254:255], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0xfe,0x82,0x03] +0xff,0x0f,0xb0,0xe1,0x00,0xfe,0x82,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[12:15], s3 offset:4095 ; encoding: 
[0xff,0x0f,0xb0,0xe1,0x00,0x06,0x83,0x03] +0xff,0x0f,0xb0,0xe1,0x00,0x06,0x83,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[96:99], s3 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x98,0x03] +0xff,0x0f,0xb0,0xe1,0x00,0x06,0x98,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s101 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x65] +0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x65 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], m0 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x7c] +0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x7c + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], 0 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x80] +0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0x80 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], -1 offset:4095 ; encoding: [0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0xc1] +0xff,0x0f,0xb0,0xe1,0x00,0x06,0x82,0xc1 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 idxen offset:4095 ; encoding: [0xff,0x2f,0xb0,0xe1,0x00,0x06,0x82,0x03] +0xff,0x2f,0xb0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], v0, s[8:11], s3 offen offset:4095 ; encoding: [0xff,0x1f,0xb0,0xe1,0x00,0x06,0x82,0x03] +0xff,0x1f,0xb0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 ; encoding: [0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03] +0x00,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:7 ; encoding: [0x07,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03] +0x07,0x00,0xb0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xb0,0xe1,0x00,0x06,0x82,0x03] +0xff,0x4f,0xb0,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: buffer_atomic_dec_x2 a[6:7], off, s[8:11], s3 offset:4095 slc ; encoding: 
[0xff,0x0f,0xb2,0xe1,0x00,0x06,0x82,0x03] +0xff,0x0f,0xb2,0xe1,0x00,0x06,0x82,0x03 + +# GFX90A: tbuffer_load_format_x a1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x78,0xe9,0x00,0x01,0x81,0x01] +0x00,0x00,0x78,0xe9,0x00,0x01,0x81,0x01 + +# GFX90A: tbuffer_load_format_xy a[2:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x78,0xe9,0x00,0x02,0x81,0x01] +0x00,0x80,0x78,0xe9,0x00,0x02,0x81,0x01 + +# GFX90A: tbuffer_load_format_xyz a[2:4], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x79,0xe9,0x00,0x02,0x81,0x01] +0x00,0x00,0x79,0xe9,0x00,0x02,0x81,0x01 + +# GFX90A: tbuffer_load_format_xyzw a[2:5], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x79,0xe9,0x00,0x02,0x81,0x01] +0x00,0x80,0x79,0xe9,0x00,0x02,0x81,0x01 + +# GFX90A: tbuffer_store_format_x a1, off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x00,0x7a,0xe9,0x00,0x01,0x81,0x01] +0x00,0x00,0x7a,0xe9,0x00,0x01,0x81,0x01 + +# GFX90A: tbuffer_store_format_xy a[2:3], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7a,0xe9,0x00,0x02,0x81,0x01] +0x00,0x80,0x7a,0xe9,0x00,0x02,0x81,0x01 + +# GFX90A: tbuffer_store_format_xyzw a[2:5], off, s[4:7], s1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x81,0x01] +0x00,0x80,0x7b,0xe9,0x00,0x02,0x81,0x01 + +# GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d] +0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d + +# GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15] ; encoding: [0x00,0x80,0x7b,0xe8,0x00,0x02,0x9c,0x6d] 
+0x00,0x80,0x7b,0xe8,0x00,0x02,0x9c,0x6d + +# GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x03,0xe9,0x00,0x02,0x9c,0x6d] +0x00,0x80,0x03,0xe9,0x00,0x02,0x9c,0x6d + +# GFX90A: tbuffer_store_format_xyzw a[2:5], off, ttmp[4:7], ttmp1 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] ; encoding: [0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d] +0x00,0x80,0x7b,0xe9,0x00,0x02,0x9c,0x6d + +# GFX90A: ds_add_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x00,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x00,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_add_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x00,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x00,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_add_u32 v1, a2 ; encoding: [0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_u32 v1, a2 ; encoding: [0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x00,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x00,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x00,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x01,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x01,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_sub_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x02,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_sub_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x02,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_sub_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x02,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x02,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_sub_u32 v1, a2 ; encoding: [0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: 
ds_sub_u32 v1, a2 ; encoding: [0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x02,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_sub_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x02,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x02,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_sub_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x03,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x03,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x04,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x04,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x04,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x04,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_rsub_u32 v1, a2 ; encoding: [0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u32 v1, a2 ; encoding: [0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x04,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x04,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x04,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x05,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x05,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x06,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x06,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_inc_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x06,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x06,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_inc_u32 v1, a2 ; encoding: [0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u32 v1, a2 ; encoding: [0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00] 
+0x00,0x00,0x06,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x06,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x06,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x07,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x07,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x08,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x08,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_dec_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x08,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x08,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_dec_u32 v1, a2 ; encoding: [0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u32 v1, a2 ; encoding: [0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x08,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x08,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x08,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x09,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x09,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x0a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x0a,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_min_i32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0a,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x0a,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_min_i32 v1, a2 ; encoding: [0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i32 v1, a2 ; encoding: [0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x0a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i32 v1, a2 offset:4 ; encoding: 
[0x04,0x00,0x0a,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x0a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0b,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x0b,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x0c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x0c,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_max_i32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0c,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x0c,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_max_i32 v1, a2 ; encoding: [0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i32 v1, a2 ; encoding: [0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x0c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x0c,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x0c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0d,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x0d,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x0e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x0e,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_min_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x0e,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x0e,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_min_u32 v1, a2 ; encoding: [0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_u32 v1, a2 ; encoding: [0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x0e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x0e,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x0e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: 
ds_min_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x0f,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x0f,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x10,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x10,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_max_u32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x10,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x10,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_max_u32 v1, a2 ; encoding: [0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u32 v1, a2 ; encoding: [0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x10,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x10,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x10,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x11,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x11,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x12,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x12,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_and_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x12,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x12,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_and_b32 v1, a2 ; encoding: [0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b32 v1, a2 ; encoding: [0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x12,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x12,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x12,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x13,0xda,0x01,0x02,0x00,0x00] 
+0xff,0xff,0x13,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x14,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x14,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_or_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x14,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x14,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_or_b32 v1, a2 ; encoding: [0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b32 v1, a2 ; encoding: [0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x14,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x14,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x14,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x15,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x15,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x16,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x16,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_xor_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x16,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x16,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_xor_b32 v1, a2 ; encoding: [0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b32 v1, a2 ; encoding: [0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x16,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x16,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x16,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x17,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x17,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_mskor_b32 v1, a2, a3 offset:65535 ; encoding: 
[0xff,0xff,0x18,0xda,0x01,0x02,0x03,0x00] +0xff,0xff,0x18,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_mskor_b32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0xff,0x02,0x03,0x00] +0xff,0xff,0x18,0xda,0xff,0x02,0x03,0x00 + +# GFX90A: ds_mskor_b32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0xff,0x03,0x00] +0xff,0xff,0x18,0xda,0x01,0xff,0x03,0x00 + +# GFX90A: ds_mskor_b32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x18,0xda,0x01,0x02,0xff,0x00] +0xff,0xff,0x18,0xda,0x01,0x02,0xff,0x00 + +# GFX90A: ds_mskor_b32 v1, a2, a3 ; encoding: [0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00] +0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_mskor_b32 v1, a2, a3 ; encoding: [0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00] +0x00,0x00,0x18,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_mskor_b32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x18,0xda,0x01,0x02,0x03,0x00] +0x04,0x00,0x18,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_mskor_b32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x19,0xda,0x01,0x02,0x03,0x00] +0xff,0xff,0x19,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write_b32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x1a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x1a,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_write_b32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x1a,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x1a,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_write_b32 v1, a2 ; encoding: [0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b32 v1, a2 ; encoding: [0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x1a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x1a,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x1a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b32 v1, a2 offset:65535 gds ; encoding: 
[0xff,0xff,0x1b,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x1b,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0x02,0x03,0x00] +0x7f,0xff,0x1c,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2_b32 v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0xff,0x02,0x03,0x00] +0x7f,0xff,0x1c,0xda,0xff,0x02,0x03,0x00 + +# GFX90A: ds_write2_b32 v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0xff,0x03,0x00] +0x7f,0xff,0x1c,0xda,0x01,0xff,0x03,0x00 + +# GFX90A: ds_write2_b32 v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1c,0xda,0x01,0x02,0xff,0x00] +0x7f,0xff,0x1c,0xda,0x01,0x02,0xff,0x00 + +# GFX90A: ds_write2_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00] +0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00] +0x00,0xff,0x1c,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2_b32 v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x1c,0xda,0x01,0x02,0x03,0x00] +0x10,0xff,0x1c,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00] +0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00] +0x7f,0x00,0x1c,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x1c,0xda,0x01,0x02,0x03,0x00] +0x7f,0x01,0x1c,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2_b32 v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x1d,0xda,0x01,0x02,0x03,0x00] +0x7f,0xff,0x1d,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0x02,0x03,0x00] +0x7f,0xff,0x1e,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v255, a2, a3 offset0:127 offset1:255 ; 
encoding: [0x7f,0xff,0x1e,0xda,0xff,0x02,0x03,0x00] +0x7f,0xff,0x1e,0xda,0xff,0x02,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0xff,0x03,0x00] +0x7f,0xff,0x1e,0xda,0x01,0xff,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x1e,0xda,0x01,0x02,0xff,0x00] +0x7f,0xff,0x1e,0xda,0x01,0x02,0xff,0x00 + +# GFX90A: ds_write2st64_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00] +0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00] +0x00,0xff,0x1e,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x1e,0xda,0x01,0x02,0x03,0x00] +0x10,0xff,0x1e,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00] +0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00] +0x7f,0x00,0x1e,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x1e,0xda,0x01,0x02,0x03,0x00] +0x7f,0x01,0x1e,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_write2st64_b32 v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x1f,0xda,0x01,0x02,0x03,0x00] +0x7f,0xff,0x1f,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_b32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0x02,0x03,0x00] +0xff,0xff,0x20,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_b32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0xff,0x02,0x03,0x00] +0xff,0xff,0x20,0xda,0xff,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_b32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x20,0xda,0x01,0xff,0x03,0x00] +0xff,0xff,0x20,0xda,0x01,0xff,0x03,0x00 + +# GFX90A: ds_cmpst_b32 v1, a2, a255 offset:65535 ; 
encoding: [0xff,0xff,0x20,0xda,0x01,0x02,0xff,0x00] +0xff,0xff,0x20,0xda,0x01,0x02,0xff,0x00 + +# GFX90A: ds_cmpst_b32 v1, a2, a3 ; encoding: [0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00] +0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_b32 v1, a2, a3 ; encoding: [0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00] +0x00,0x00,0x20,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_b32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x20,0xda,0x01,0x02,0x03,0x00] +0x04,0x00,0x20,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_b32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x21,0xda,0x01,0x02,0x03,0x00] +0xff,0xff,0x21,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_f32 v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0x02,0x03,0x00] +0xff,0xff,0x22,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_f32 v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0xff,0x02,0x03,0x00] +0xff,0xff,0x22,0xda,0xff,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_f32 v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0xff,0x03,0x00] +0xff,0xff,0x22,0xda,0x01,0xff,0x03,0x00 + +# GFX90A: ds_cmpst_f32 v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x22,0xda,0x01,0x02,0xff,0x00] +0xff,0xff,0x22,0xda,0x01,0x02,0xff,0x00 + +# GFX90A: ds_cmpst_f32 v1, a2, a3 ; encoding: [0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00] +0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_f32 v1, a2, a3 ; encoding: [0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00] +0x00,0x00,0x22,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_f32 v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x22,0xda,0x01,0x02,0x03,0x00] +0x04,0x00,0x22,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_cmpst_f32 v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x23,0xda,0x01,0x02,0x03,0x00] +0xff,0xff,0x23,0xda,0x01,0x02,0x03,0x00 + +# GFX90A: ds_min_f32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x24,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x24,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_f32 v255, a2 offset:65535 ; encoding: 
[0xff,0xff,0x24,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x24,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_min_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x24,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x24,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_min_f32 v1, a2 ; encoding: [0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_f32 v1, a2 ; encoding: [0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x24,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_f32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x24,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x24,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x25,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x25,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x26,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x26,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_max_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x26,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x26,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_max_f32 v1, a2 ; encoding: [0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f32 v1, a2 ; encoding: [0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x26,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x26,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x26,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x27,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x27,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_f32 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x2a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_f32 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x2a,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: 
ds_add_f32 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x2a,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x2a,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_add_f32 v1, a2 ; encoding: [0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_f32 v1, a2 ; encoding: [0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x2a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_f32 v1, a2 offset:4 ; encoding: [0x04,0x00,0x2a,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x2a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_f32 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x2b,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x2b,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x3c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x3c,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_write_b8 v1, a255 offset:65535 ; encoding: [0xff,0xff,0x3c,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x3c,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_write_b8 v1, a2 ; encoding: [0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8 v1, a2 ; encoding: [0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x3c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8 v1, a2 offset:4 ; encoding: [0x04,0x00,0x3c,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x3c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x3d,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x3d,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16 v1, a2 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x3e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16 v255, a2 offset:65535 ; encoding: [0xff,0xff,0x3e,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x3e,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_write_b16 v1, a255 offset:65535 ; encoding: 
[0xff,0xff,0x3e,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0x3e,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_write_b16 v1, a2 ; encoding: [0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16 v1, a2 ; encoding: [0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x3e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16 v1, a2 offset:4 ; encoding: [0x04,0x00,0x3e,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x3e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16 v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x3f,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x3f,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x40,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x40,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_add_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x40,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x40,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x40,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_add_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x40,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x40,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x40,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x41,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x41,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x42,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_sub_rtn_u32 a255, v1, a2 offset:65535 ; encoding: 
[0xff,0xff,0x42,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x42,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_sub_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x42,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_sub_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x42,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x42,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_sub_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_sub_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x42,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x42,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x42,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_sub_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x43,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x43,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x44,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_rsub_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x44,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_rsub_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x44,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_rsub_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x44,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x44,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_rsub_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_rsub_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x44,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_rsub_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x44,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x44,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_rsub_rtn_u32 a5, v1, a2 
offset:65535 gds ; encoding: [0xff,0xff,0x45,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x45,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x46,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_inc_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x46,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_inc_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x46,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_inc_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x46,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x46,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_inc_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_inc_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x46,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x46,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x46,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_inc_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x47,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x47,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x48,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_dec_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x48,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_dec_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x48,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_dec_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x48,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x48,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_dec_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05 + 
+# GFX90A: ds_dec_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x48,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x48,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x48,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_dec_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x49,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x49,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_i32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x4a,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_min_rtn_i32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x4a,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_i32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4a,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x4a,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_min_rtn_i32 a5, v1, a2 ; encoding: [0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_i32 a5, v1, a2 ; encoding: [0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x4a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4a,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x4a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_i32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4b,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x4b,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_i32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x4c,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_max_rtn_i32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0xff,0x02,0x00,0x05] 
+0xff,0xff,0x4c,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_i32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4c,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x4c,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_max_rtn_i32 a5, v1, a2 ; encoding: [0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_i32 a5, v1, a2 ; encoding: [0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x4c,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4c,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x4c,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_i32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4d,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x4d,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x4e,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_min_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x4e,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x4e,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x4e,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_min_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x4e,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x4e,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x4e,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x4f,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x4f,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:65535 ; encoding: 
[0xff,0xff,0x50,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x50,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_u32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x50,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_max_rtn_u32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x50,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_u32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x50,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x50,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_max_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_u32 a5, v1, a2 ; encoding: [0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x50,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x50,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x50,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_u32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x51,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x51,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x52,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_and_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x52,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_and_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x52,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_and_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x52,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x52,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_and_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_and_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x52,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_and_rtn_b32 a5, v1, a2 
offset:4 ; encoding: [0x04,0x00,0x52,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x52,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_and_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x53,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x53,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x54,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_or_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x54,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_or_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x54,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_or_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x54,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x54,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_or_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_or_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x54,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x54,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x54,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_or_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x55,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x55,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x56,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_xor_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x56,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_xor_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x56,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_xor_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x56,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x56,0xda,0x01,0xff,0x00,0x05 + 
+# GFX90A: ds_xor_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_xor_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x56,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x56,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x56,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_xor_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x57,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x57,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05] +0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_mskor_rtn_b32 a255, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0xff] +0xff,0xff,0x58,0xda,0x01,0x02,0x05,0xff + +# GFX90A: ds_mskor_rtn_b32 a5, v255, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0xff,0x02,0x05,0x05] +0xff,0xff,0x58,0xda,0xff,0x02,0x05,0x05 + +# GFX90A: ds_mskor_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0xff,0x03,0x05] +0xff,0xff,0x58,0xda,0x01,0xff,0x03,0x05 + +# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05] +0xff,0xff,0x58,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05] +0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05] +0x00,0x00,0x58,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:4 ; encoding: [0x04,0x00,0x58,0xda,0x01,0x02,0x05,0x05] +0x04,0x00,0x58,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_mskor_rtn_b32 a5, v1, a2, a5 offset:65535 gds ; encoding: [0xff,0xff,0x59,0xda,0x01,0x02,0x05,0x05] +0xff,0xff,0x59,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 ; encoding: 
[0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_wrxchg_rtn_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x5a,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_wrxchg_rtn_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x5a,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x5a,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x5a,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x5a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x5a,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x5a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_wrxchg_rtn_b32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x5b,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x5b,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0x06] +0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0xfe] +0x7f,0xff,0x5c,0xda,0x01,0x02,0x03,0xfe + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0xff,0x02,0x03,0x06] +0x7f,0xff,0x5c,0xda,0xff,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0xff,0x03,0x06] +0x7f,0xff,0x5c,0xda,0x01,0xff,0x03,0x06 + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xda,0x01,0x02,0xff,0x06] +0x7f,0xff,0x5c,0xda,0x01,0x02,0xff,0x06 + +# GFX90A: 
ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06] +0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06] +0x00,0xff,0x5c,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x5c,0xda,0x01,0x02,0x03,0x06] +0x10,0xff,0x5c,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06] +0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06] +0x7f,0x00,0x5c,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x5c,0xda,0x01,0x02,0x03,0x06] +0x7f,0x01,0x5c,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x5d,0xda,0x01,0x02,0x03,0x06] +0x7f,0xff,0x5d,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0x06] +0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[254:255], v1, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0xfe] +0x7f,0xff,0x5e,0xda,0x01,0x02,0x03,0xfe + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v255, a2, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0xff,0x02,0x03,0x06] +0x7f,0xff,0x5e,0xda,0xff,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a255, a3 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0xff,0x03,0x06] +0x7f,0xff,0x5e,0xda,0x01,0xff,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xda,0x01,0x02,0xff,0x06] 
+0x7f,0xff,0x5e,0xda,0x01,0x02,0xff,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06] +0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset1:255 ; encoding: [0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06] +0x00,0xff,0x5e,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x5e,0xda,0x01,0x02,0x03,0x06] +0x10,0xff,0x5e,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06] +0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 ; encoding: [0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06] +0x7f,0x00,0x5e,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x5e,0xda,0x01,0x02,0x03,0x06] +0x7f,0x01,0x5e,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b32 a[6:7], v1, a2, a3 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x5f,0xda,0x01,0x02,0x03,0x06] +0x7f,0xff,0x5f,0xda,0x01,0x02,0x03,0x06 + +# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0x03,0x05] +0xff,0xff,0x60,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_b32 a255, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0x03,0xff] +0xff,0xff,0x60,0xda,0x01,0x02,0x03,0xff + +# GFX90A: ds_cmpst_rtn_b32 a5, v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0xff,0x02,0x03,0x05] +0xff,0xff,0x60,0xda,0xff,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0xff,0x03,0x05] +0xff,0xff,0x60,0xda,0x01,0xff,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x01,0x02,0xff,0x05] +0xff,0xff,0x60,0xda,0x01,0x02,0xff,0x05 + +# GFX90A: 
ds_cmpst_rtn_b32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05] +0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05] +0x00,0x00,0x60,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x60,0xda,0x01,0x02,0x03,0x05] +0x04,0x00,0x60,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_b32 a5, v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x61,0xda,0x01,0x02,0x03,0x05] +0xff,0xff,0x61,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0x03,0x05] +0xff,0xff,0x62,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_f32 a255, v1, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0x03,0xff] +0xff,0xff,0x62,0xda,0x01,0x02,0x03,0xff + +# GFX90A: ds_cmpst_rtn_f32 a5, v255, a2, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0xff,0x02,0x03,0x05] +0xff,0xff,0x62,0xda,0xff,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_f32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0xff,0x03,0x05] +0xff,0xff,0x62,0xda,0x01,0xff,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a255 offset:65535 ; encoding: [0xff,0xff,0x62,0xda,0x01,0x02,0xff,0x05] +0xff,0xff,0x62,0xda,0x01,0x02,0xff,0x05 + +# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05] +0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 ; encoding: [0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05] +0x00,0x00,0x62,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:4 ; encoding: [0x04,0x00,0x62,0xda,0x01,0x02,0x03,0x05] +0x04,0x00,0x62,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_cmpst_rtn_f32 a5, v1, a2, a3 offset:65535 gds ; encoding: [0xff,0xff,0x63,0xda,0x01,0x02,0x03,0x05] +0xff,0xff,0x63,0xda,0x01,0x02,0x03,0x05 + +# GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:65535 ; encoding: 
[0xff,0xff,0x64,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x64,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x64,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_min_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x64,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x64,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x64,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_min_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x64,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x64,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x64,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_min_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x65,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x65,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x66,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x66,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_max_rtn_f32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x66,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x66,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x66,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_max_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x66,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_f32 a5, v1, a2 
offset:4 ; encoding: [0x04,0x00,0x66,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x66,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_max_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x67,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x67,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05] +0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_wrap_rtn_b32 a255, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0xff] +0xff,0xff,0x68,0xda,0x01,0x02,0x05,0xff + +# GFX90A: ds_wrap_rtn_b32 a5, v255, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0xff,0x02,0x05,0x05] +0xff,0xff,0x68,0xda,0xff,0x02,0x05,0x05 + +# GFX90A: ds_wrap_rtn_b32 a5, v1, a255, a3 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0xff,0x03,0x05] +0xff,0xff,0x68,0xda,0x01,0xff,0x03,0x05 + +# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05] +0xff,0xff,0x68,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05] +0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 ; encoding: [0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05] +0x00,0x00,0x68,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:4 ; encoding: [0x04,0x00,0x68,0xda,0x01,0x02,0x05,0x05] +0x04,0x00,0x68,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_wrap_rtn_b32 a5, v1, a2, a5 offset:65535 gds ; encoding: [0xff,0xff,0x69,0xda,0x01,0x02,0x05,0x05] +0xff,0xff,0x69,0xda,0x01,0x02,0x05,0x05 + +# GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_f32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x6a,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_add_rtn_f32 a5, v255, a2 offset:65535 ; encoding: 
[0xff,0xff,0x6a,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x6a,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_f32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x6a,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x6a,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_add_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_f32 a5, v1, a2 ; encoding: [0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x6a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x6a,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x6a,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_add_rtn_f32 a5, v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0x6b,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x6b,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_read_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0x6c,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0x6c,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_b32 a5, v1 ; encoding: [0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_b32 a5, v1 ; encoding: [0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x6c,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_b32 a5, v1 offset:4 ; encoding: [0x04,0x00,0x6c,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0x6c,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_b32 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x6d,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x6d,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0x06] +0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b32 a[254:255], v1 offset0:127 offset1:255 ; encoding: 
[0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0xfe] +0x7f,0xff,0x6e,0xda,0x01,0x00,0x00,0xfe + +# GFX90A: ds_read2_b32 a[6:7], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x6e,0xda,0xff,0x00,0x00,0x06] +0x7f,0xff,0x6e,0xda,0xff,0x00,0x00,0x06 + +# GFX90A: ds_read2_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06] +0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06] +0x00,0xff,0x6e,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b32 a[6:7], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x6e,0xda,0x01,0x00,0x00,0x06] +0x10,0xff,0x6e,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06] +0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06] +0x7f,0x00,0x6e,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x6e,0xda,0x01,0x00,0x00,0x06] +0x7f,0x01,0x6e,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b32 a[6:7], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x6f,0xda,0x01,0x00,0x00,0x06] +0x7f,0xff,0x6f,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0x06] +0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b32 a[254:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0xfe] +0x7f,0xff,0x70,0xda,0x01,0x00,0x00,0xfe + +# GFX90A: ds_read2st64_b32 a[6:7], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x70,0xda,0xff,0x00,0x00,0x06] +0x7f,0xff,0x70,0xda,0xff,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b32 a[6:7], v1 offset1:255 ; encoding: [0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06] +0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b32 a[6:7], v1 offset1:255 ; encoding: 
[0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06] +0x00,0xff,0x70,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0x70,0xda,0x01,0x00,0x00,0x06] +0x10,0xff,0x70,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06] +0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 ; encoding: [0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06] +0x7f,0x00,0x70,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x70,0xda,0x01,0x00,0x00,0x06] +0x7f,0x01,0x70,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b32 a[6:7], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x71,0xda,0x01,0x00,0x00,0x06] +0x7f,0xff,0x71,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_i8 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x72,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0x72,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_i8 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x72,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0x72,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_i8 a5, v1 ; encoding: [0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8 a5, v1 ; encoding: [0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x72,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8 a5, v1 offset:4 ; encoding: [0x04,0x00,0x72,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0x72,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x73,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x73,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x74,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x74,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8 a255, v1 offset:65535 ; 
encoding: [0xff,0xff,0x74,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0x74,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_u8 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x74,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0x74,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_u8 a5, v1 ; encoding: [0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8 a5, v1 ; encoding: [0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x74,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8 a5, v1 offset:4 ; encoding: [0x04,0x00,0x74,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0x74,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x75,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x75,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x76,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0x76,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_i16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x76,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0x76,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_i16 a5, v1 ; encoding: [0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i16 a5, v1 ; encoding: [0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x76,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i16 a5, v1 offset:4 ; encoding: [0x04,0x00,0x76,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0x76,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x77,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x77,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x78,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0x01,0x00,0x00,0xff] 
+0xff,0xff,0x78,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_u16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x78,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0x78,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_u16 a5, v1 ; encoding: [0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16 a5, v1 ; encoding: [0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x78,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16 a5, v1 offset:4 ; encoding: [0x04,0x00,0x78,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0x78,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x79,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x79,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_swizzle_b32 a5, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_swizzle_b32 a255, v1 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0x7a,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_swizzle_b32 a5, v255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0x7a,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_swizzle_b32 a5, v1 ; encoding: [0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_swizzle_b32 a5, v1 ; encoding: [0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0x7a,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_swizzle_b32 a5, v1 offset:swizzle(BITMASK_PERM,"00p00") ; encoding: [0x04,0x00,0x7a,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0x7a,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_swizzle_b32 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0x7b,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_permute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_permute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0xff] 
+0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_permute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x7c,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_permute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x7c,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_permute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_permute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_permute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7c,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x7c,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_bpermute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0x05] +0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_bpermute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0xff] +0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0xff + +# GFX90A: ds_bpermute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0xff,0x02,0x00,0x05] +0xff,0xff,0x7e,0xda,0xff,0x02,0x00,0x05 + +# GFX90A: ds_bpermute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0xff,0x00,0x05] +0xff,0xff,0x7e,0xda,0x01,0xff,0x00,0x05 + +# GFX90A: ds_bpermute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_bpermute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05] +0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_bpermute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7e,0xda,0x01,0x02,0x00,0x05] +0x04,0x00,0x7e,0xda,0x01,0x02,0x00,0x05 + +# GFX90A: ds_add_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x80,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_u64 v255, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0x80,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x80,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_add_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x80,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_add_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x80,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x80,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x80,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_add_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x81,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x81,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_sub_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x82,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_sub_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x82,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_sub_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x82,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x82,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_sub_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_sub_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x82,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_sub_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x82,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x82,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_sub_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x83,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x83,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x84,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u64 v255, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0x84,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x84,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x84,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x84,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_rsub_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x84,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x84,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x84,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_rsub_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x85,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x85,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x86,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x86,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_inc_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x86,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x86,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_inc_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x86,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x86,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x86,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_inc_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x87,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x87,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x88,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u64 v255, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0x88,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x88,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_dec_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x88,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x88,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_dec_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x88,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x88,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x88,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_dec_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x89,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x89,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x8a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x8a,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_min_i64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8a,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x8a,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_min_i64 v1, a[2:3] ; encoding: [0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i64 v1, a[2:3] ; encoding: [0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x8a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x8a,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x8a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_i64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8b,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x8b,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x8c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i64 v255, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0x8c,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x8c,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_max_i64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8c,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x8c,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_max_i64 v1, a[2:3] ; encoding: [0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i64 v1, a[2:3] ; encoding: [0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x8c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x8c,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x8c,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_i64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8d,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x8d,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x8e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_u64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x8e,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_min_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x8e,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x8e,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_min_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x8e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x8e,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x8e,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x8f,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x8f,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x90,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u64 v255, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0x90,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x90,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_max_u64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x90,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x90,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_max_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u64 v1, a[2:3] ; encoding: [0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x90,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x90,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x90,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_u64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x91,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x91,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x92,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x92,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_and_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x92,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x92,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_and_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x92,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x92,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x92,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_and_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x93,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x93,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x94,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b64 v255, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0x94,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x94,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_or_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x94,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x94,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_or_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x94,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x94,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x94,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_or_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x95,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x95,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x96,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x96,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_xor_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x96,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x96,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_xor_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x96,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x96,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x96,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_xor_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x97,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x97,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0x02,0x04,0x00] +0xff,0xff,0x98,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_mskor_b64 v255, a[2:3], a[4:5] offset:65535 ; encoding: 
[0xff,0xff,0x98,0xda,0xff,0x02,0x04,0x00] +0xff,0xff,0x98,0xda,0xff,0x02,0x04,0x00 + +# GFX90A: ds_mskor_b64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0xfe,0x04,0x00] +0xff,0xff,0x98,0xda,0x01,0xfe,0x04,0x00 + +# GFX90A: ds_mskor_b64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0x98,0xda,0x01,0x02,0xfe,0x00] +0xff,0xff,0x98,0xda,0x01,0x02,0xfe,0x00 + +# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00] +0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00] +0x00,0x00,0x98,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0x98,0xda,0x01,0x02,0x04,0x00] +0x04,0x00,0x98,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_mskor_b64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0x99,0xda,0x01,0x02,0x04,0x00] +0xff,0xff,0x99,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write_b64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x9a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0x9a,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_write_b64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0x9a,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0x9a,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_write_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b64 v1, a[2:3] ; encoding: [0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0x9a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0x9a,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0x9a,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0x9b,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0x9b,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: 
ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0x02,0x04,0x00] +0x7f,0xff,0x9c,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2_b64 v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0xff,0x02,0x04,0x00] +0x7f,0xff,0x9c,0xda,0xff,0x02,0x04,0x00 + +# GFX90A: ds_write2_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0xfe,0x04,0x00] +0x7f,0xff,0x9c,0xda,0x01,0xfe,0x04,0x00 + +# GFX90A: ds_write2_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9c,0xda,0x01,0x02,0xfe,0x00] +0x7f,0xff,0x9c,0xda,0x01,0x02,0xfe,0x00 + +# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00] +0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00] +0x00,0xff,0x9c,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0x9c,0xda,0x01,0x02,0x04,0x00] +0x10,0xff,0x9c,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00] +0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00] +0x7f,0x00,0x9c,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x9c,0xda,0x01,0x02,0x04,0x00] +0x7f,0x01,0x9c,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x9d,0xda,0x01,0x02,0x04,0x00] +0x7f,0xff,0x9d,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0x02,0x04,0x00] +0x7f,0xff,0x9e,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v255, a[2:3], a[4:5] offset0:127 
offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0xff,0x02,0x04,0x00] +0x7f,0xff,0x9e,0xda,0xff,0x02,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0xfe,0x04,0x00] +0x7f,0xff,0x9e,0xda,0x01,0xfe,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x9e,0xda,0x01,0x02,0xfe,0x00] +0x7f,0xff,0x9e,0xda,0x01,0x02,0xfe,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00] +0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00] +0x00,0xff,0x9e,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0x9e,0xda,0x01,0x02,0x04,0x00] +0x10,0xff,0x9e,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00] +0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00] +0x7f,0x00,0x9e,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0x9e,0xda,0x01,0x02,0x04,0x00] +0x7f,0x01,0x9e,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_write2st64_b64 v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0x9f,0xda,0x01,0x02,0x04,0x00] +0x7f,0xff,0x9f,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0x04,0x00] +0xff,0xff,0xa0,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_b64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xff,0x02,0x04,0x00] +0xff,0xff,0xa0,0xda,0xff,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_b64 v1, a[254:255], a[4:5] offset:65535 ; encoding: 
[0xff,0xff,0xa0,0xda,0x01,0xfe,0x04,0x00] +0xff,0xff,0xa0,0xda,0x01,0xfe,0x04,0x00 + +# GFX90A: ds_cmpst_b64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0xfe,0x00] +0xff,0xff,0xa0,0xda,0x01,0x02,0xfe,0x00 + +# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00] +0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00] +0x00,0x00,0xa0,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xa0,0xda,0x01,0x02,0x04,0x00] +0x04,0x00,0xa0,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_b64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xa1,0xda,0x01,0x02,0x04,0x00] +0xff,0xff,0xa1,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0x02,0x04,0x00] +0xff,0xff,0xa2,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_f64 v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0xff,0x02,0x04,0x00] +0xff,0xff,0xa2,0xda,0xff,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_f64 v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0xfe,0x04,0x00] +0xff,0xff,0xa2,0xda,0x01,0xfe,0x04,0x00 + +# GFX90A: ds_cmpst_f64 v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa2,0xda,0x01,0x02,0xfe,0x00] +0xff,0xff,0xa2,0xda,0x01,0x02,0xfe,0x00 + +# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00] +0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00] +0x00,0x00,0xa2,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xa2,0xda,0x01,0x02,0x04,0x00] +0x04,0x00,0xa2,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_cmpst_f64 v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xa3,0xda,0x01,0x02,0x04,0x00] 
+0xff,0xff,0xa3,0xda,0x01,0x02,0x04,0x00 + +# GFX90A: ds_min_f64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0xa4,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_f64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0xa4,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_min_f64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa4,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0xa4,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_min_f64 v1, a[2:3] ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_f64 v1, a[2:3] ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_f64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xa4,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0xa4,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_min_f64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xa5,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0xa5,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0xa6,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f64 v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0xa6,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_max_f64 v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xa6,0xda,0x01,0xfe,0x00,0x00] +0xff,0xff,0xa6,0xda,0x01,0xfe,0x00,0x00 + +# GFX90A: ds_max_f64 v1, a[2:3] ; encoding: [0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f64 v1, a[2:3] ; encoding: [0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xa6,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f64 v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xa6,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0xa6,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_max_f64 v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xa7,0xda,0x01,0x02,0x00,0x00] 
+0xff,0xff,0xa7,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8_d16_hi v1, a2 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0xa8,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8_d16_hi v255, a2 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0xa8,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_write_b8_d16_hi v1, a255 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0xa8,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_write_b8_d16_hi v1, a2 ; encoding: [0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8_d16_hi v1, a2 ; encoding: [0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xa8,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8_d16_hi v1, a2 offset:4 ; encoding: [0x04,0x00,0xa8,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0xa8,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b8_d16_hi v1, a2 offset:65535 gds ; encoding: [0xff,0xff,0xa9,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0xa9,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16_d16_hi v1, a2 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0xaa,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16_d16_hi v255, a2 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0xff,0x02,0x00,0x00] +0xff,0xff,0xaa,0xda,0xff,0x02,0x00,0x00 + +# GFX90A: ds_write_b16_d16_hi v1, a255 offset:65535 ; encoding: [0xff,0xff,0xaa,0xda,0x01,0xff,0x00,0x00] +0xff,0xff,0xaa,0xda,0x01,0xff,0x00,0x00 + +# GFX90A: ds_write_b16_d16_hi v1, a2 ; encoding: [0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16_d16_hi v1, a2 ; encoding: [0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xaa,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16_d16_hi v1, a2 offset:4 ; encoding: [0x04,0x00,0xaa,0xda,0x01,0x02,0x00,0x00] +0x04,0x00,0xaa,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b16_d16_hi v1, a2 offset:65535 gds ; encoding: 
[0xff,0xff,0xab,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0xab,0xda,0x01,0x02,0x00,0x00 + +# GFX90A: ds_read_u8_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xac,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0xac,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_u8_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0xac,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16 a5, v1 ; encoding: [0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16 a5, v1 ; encoding: [0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xac,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16 a5, v1 offset:4 ; encoding: [0x04,0x00,0xac,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0xac,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xad,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xad,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xae,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0xae,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_u8_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xae,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0xae,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16_hi a5, v1 ; encoding: [0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16_hi a5, v1 ; encoding: [0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xae,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xae,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0xae,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u8_d16_hi a5, v1 offset:65535 gds ; encoding: 
[0xff,0xff,0xaf,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xaf,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0xb0,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_i8_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb0,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0xb0,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16 a5, v1 ; encoding: [0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16 a5, v1 ; encoding: [0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xb0,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16 a5, v1 offset:4 ; encoding: [0x04,0x00,0xb0,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0xb0,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb1,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xb1,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0xb2,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_i8_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb2,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0xb2,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16_hi a5, v1 ; encoding: [0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16_hi a5, v1 ; encoding: [0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xb2,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xb2,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0xb2,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_i8_d16_hi a5, v1 offset:65535 gds ; encoding: 
[0xff,0xff,0xb3,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xb3,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16 a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16 a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0xb4,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_u16_d16 a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb4,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0xb4,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16 a5, v1 ; encoding: [0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16 a5, v1 ; encoding: [0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xb4,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16 a5, v1 offset:4 ; encoding: [0x04,0x00,0xb4,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0xb4,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16 a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0xb5,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xb5,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16_hi a5, v1 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16_hi a255, v1 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0xff] +0xff,0xff,0xb6,0xda,0x01,0x00,0x00,0xff + +# GFX90A: ds_read_u16_d16_hi a5, v255 offset:65535 ; encoding: [0xff,0xff,0xb6,0xda,0xff,0x00,0x00,0x05] +0xff,0xff,0xb6,0xda,0xff,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16_hi a5, v1 ; encoding: [0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16_hi a5, v1 ; encoding: [0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05] +0x00,0x00,0xb6,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16_hi a5, v1 offset:4 ; encoding: [0x04,0x00,0xb6,0xda,0x01,0x00,0x00,0x05] +0x04,0x00,0xb6,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_read_u16_d16_hi a5, v1 offset:65535 gds ; encoding: 
[0xff,0xff,0xb7,0xda,0x01,0x00,0x00,0x05] +0xff,0xff,0xb7,0xda,0x01,0x00,0x00,0x05 + +# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_add_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xc0,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_add_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xc0,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc0,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xc0,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc0,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc0,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xc0,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_add_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc1,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc1,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_sub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xc2,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_sub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xc2,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc2,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xc2,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: 
[0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc2,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc2,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xc2,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_sub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc3,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc3,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_rsub_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xc4,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_rsub_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xc4,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc4,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xc4,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc4,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xc4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_rsub_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc5,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc5,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_inc_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xc6,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_inc_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xc6,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc6,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xc6,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xc6,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xc6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_inc_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc7,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc7,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_dec_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xc8,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_dec_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xc8,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xc8,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xc8,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: 
[0x04,0x00,0xc8,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xc8,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_dec_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xc9,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xc9,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xca,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_i64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xca,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_min_rtn_i64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xca,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xca,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xca,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xca,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xca,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xca,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcb,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xcb,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_i64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xcc,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_max_rtn_i64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xcc,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[254:255] 
offset:65535 ; encoding: [0xff,0xff,0xcc,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xcc,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xcc,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xcc,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xcc,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_i64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcd,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xcd,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xce,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xce,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_min_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xce,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xce,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xce,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xce,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xce,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xce,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xcf,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xcf,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_u64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xd0,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_max_rtn_u64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xd0,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd0,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xd0,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xd0,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd0,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xd0,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_u64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd1,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xd1,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_and_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xd2,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_and_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xd2,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd2,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xd2,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] ; encoding: 
[0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xd2,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd2,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xd2,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_and_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd3,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xd3,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_or_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xd4,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_or_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xd4,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd4,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xd4,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xd4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd4,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xd4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_or_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd5,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xd5,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_xor_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xd6,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_xor_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: 
[0xff,0xff,0xd6,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xd6,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd6,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xd6,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xd6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xd6,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xd6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_xor_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xd7,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xd7,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0x06] +0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_mskor_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0xfe] +0xff,0xff,0xd8,0xda,0x01,0x02,0x04,0xfe + +# GFX90A: ds_mskor_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0xff,0x02,0x04,0x06] +0xff,0xff,0xd8,0xda,0xff,0x02,0x04,0x06 + +# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0xfe,0x04,0x06] +0xff,0xff,0xd8,0xda,0x01,0xfe,0x04,0x06 + +# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xd8,0xda,0x01,0x02,0xfe,0x06] +0xff,0xff,0xd8,0xda,0x01,0x02,0xfe,0x06 + +# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06] +0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06] +0x00,0x00,0xd8,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: 
ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xd8,0xda,0x01,0x02,0x04,0x06] +0x04,0x00,0xd8,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_mskor_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xd9,0xda,0x01,0x02,0x04,0x06] +0xff,0xff,0xd9,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xda,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_wrxchg_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xda,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xda,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xda,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xda,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xda,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xda,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xda,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_wrxchg_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xdb,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xdb,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0x06] +0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0xfc] +0x7f,0xff,0xdc,0xda,0x01,0x02,0x04,0xfc + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 
offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0xff,0x02,0x04,0x06] +0x7f,0xff,0xdc,0xda,0xff,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0xfe,0x04,0x06] +0x7f,0xff,0xdc,0xda,0x01,0xfe,0x04,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xda,0x01,0x02,0xfe,0x06] +0x7f,0xff,0xdc,0xda,0x01,0x02,0xfe,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06] +0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06] +0x00,0xff,0xdc,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0xdc,0xda,0x01,0x02,0x04,0x06] +0x10,0xff,0xdc,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06] +0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06] +0x7f,0x00,0xdc,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xdc,0xda,0x01,0x02,0x04,0x06] +0x7f,0x01,0xdc,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xdd,0xda,0x01,0x02,0x04,0x06] +0x7f,0xff,0xdd,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0x06] +0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[252:255], v1, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0xfc] 
+0x7f,0xff,0xde,0xda,0x01,0x02,0x04,0xfc + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v255, a[2:3], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0xff,0x02,0x04,0x06] +0x7f,0xff,0xde,0xda,0xff,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[254:255], a[4:5] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0xfe,0x04,0x06] +0x7f,0xff,0xde,0xda,0x01,0xfe,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[254:255] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xda,0x01,0x02,0xfe,0x06] +0x7f,0xff,0xde,0xda,0x01,0x02,0xfe,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06] +0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset1:255 ; encoding: [0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06] +0x00,0xff,0xde,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:16 offset1:255 ; encoding: [0x10,0xff,0xde,0xda,0x01,0x02,0x04,0x06] +0x10,0xff,0xde,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06] +0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 ; encoding: [0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06] +0x7f,0x00,0xde,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xde,0xda,0x01,0x02,0x04,0x06] +0x7f,0x01,0xde,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_wrxchg2st64_rtn_b64 a[6:9], v1, a[2:3], a[4:5] offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xdf,0xda,0x01,0x02,0x04,0x06] +0x7f,0xff,0xdf,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0x06] +0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0x06 + +# 
GFX90A: ds_cmpst_rtn_b64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0xfe] +0xff,0xff,0xe0,0xda,0x01,0x02,0x04,0xfe + +# GFX90A: ds_cmpst_rtn_b64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0xff,0x02,0x04,0x06] +0xff,0xff,0xe0,0xda,0xff,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0xfe,0x04,0x06] +0xff,0xff,0xe0,0xda,0x01,0xfe,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe0,0xda,0x01,0x02,0xfe,0x06] +0xff,0xff,0xe0,0xda,0x01,0x02,0xfe,0x06 + +# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06] +0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06] +0x00,0x00,0xe0,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xe0,0xda,0x01,0x02,0x04,0x06] +0x04,0x00,0xe0,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_b64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xe1,0xda,0x01,0x02,0x04,0x06] +0xff,0xff,0xe1,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0x06] +0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_f64 a[254:255], v1, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0xfe] +0xff,0xff,0xe2,0xda,0x01,0x02,0x04,0xfe + +# GFX90A: ds_cmpst_rtn_f64 a[6:7], v255, a[2:3], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0xff,0x02,0x04,0x06] +0xff,0xff,0xe2,0xda,0xff,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[254:255], a[4:5] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0xfe,0x04,0x06] +0xff,0xff,0xe2,0xda,0x01,0xfe,0x04,0x06 + +# GFX90A: 
ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe2,0xda,0x01,0x02,0xfe,0x06] +0xff,0xff,0xe2,0xda,0x01,0x02,0xfe,0x06 + +# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06] +0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] ; encoding: [0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06] +0x00,0x00,0xe2,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:4 ; encoding: [0x04,0x00,0xe2,0xda,0x01,0x02,0x04,0x06] +0x04,0x00,0xe2,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_cmpst_rtn_f64 a[6:7], v1, a[2:3], a[4:5] offset:65535 gds ; encoding: [0xff,0xff,0xe3,0xda,0x01,0x02,0x04,0x06] +0xff,0xff,0xe3,0xda,0x01,0x02,0x04,0x06 + +# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_f64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xe4,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_min_rtn_f64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xe4,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe4,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xe4,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xe4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xe4,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xe4,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_min_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xe5,0xda,0x01,0x02,0x00,0x06] 
+0xff,0xff,0xe5,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_f64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xe6,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_max_rtn_f64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xe6,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xe6,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xe6,0xda,0x01,0xfe,0x00,0x06 + +# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xe6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xe6,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xe6,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_max_rtn_f64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xe7,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xe7,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_read_b64 a[6:7], v1 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0x01,0x00,0x00,0x06] +0xff,0xff,0xec,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b64 a[254:255], v1 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0x01,0x00,0x00,0xfe] +0xff,0xff,0xec,0xda,0x01,0x00,0x00,0xfe + +# GFX90A: ds_read_b64 a[6:7], v255 offset:65535 ; encoding: [0xff,0xff,0xec,0xda,0xff,0x00,0x00,0x06] +0xff,0xff,0xec,0xda,0xff,0x00,0x00,0x06 + +# GFX90A: ds_read_b64 a[6:7], v1 ; encoding: [0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06] +0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b64 a[6:7], v1 ; encoding: [0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06] +0x00,0x00,0xec,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b64 a[6:7], 
v1 offset:4 ; encoding: [0x04,0x00,0xec,0xda,0x01,0x00,0x00,0x06] +0x04,0x00,0xec,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b64 a[6:7], v1 offset:65535 gds ; encoding: [0xff,0xff,0xed,0xda,0x01,0x00,0x00,0x06] +0xff,0xff,0xed,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0x06] +0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b64 a[252:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0xfc] +0x7f,0xff,0xee,0xda,0x01,0x00,0x00,0xfc + +# GFX90A: ds_read2_b64 a[6:9], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xee,0xda,0xff,0x00,0x00,0x06] +0x7f,0xff,0xee,0xda,0xff,0x00,0x00,0x06 + +# GFX90A: ds_read2_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06] +0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06] +0x00,0xff,0xee,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b64 a[6:9], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0xee,0xda,0x01,0x00,0x00,0x06] +0x10,0xff,0xee,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06] +0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06] +0x7f,0x00,0xee,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xee,0xda,0x01,0x00,0x00,0x06] +0x7f,0x01,0xee,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2_b64 a[6:9], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xef,0xda,0x01,0x00,0x00,0x06] +0x7f,0xff,0xef,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0x06] +0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b64 a[252:255], v1 offset0:127 
offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0xfc] +0x7f,0xff,0xf0,0xda,0x01,0x00,0x00,0xfc + +# GFX90A: ds_read2st64_b64 a[6:9], v255 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xf0,0xda,0xff,0x00,0x00,0x06] +0x7f,0xff,0xf0,0xda,0xff,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06] +0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b64 a[6:9], v1 offset1:255 ; encoding: [0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06] +0x00,0xff,0xf0,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:16 offset1:255 ; encoding: [0x10,0xff,0xf0,0xda,0x01,0x00,0x00,0x06] +0x10,0xff,0xf0,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06] +0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 ; encoding: [0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06] +0x7f,0x00,0xf0,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:1 ; encoding: [0x7f,0x01,0xf0,0xda,0x01,0x00,0x00,0x06] +0x7f,0x01,0xf0,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read2st64_b64 a[6:9], v1 offset0:127 offset1:255 gds ; encoding: [0x7f,0xff,0xf1,0xda,0x01,0x00,0x00,0x06] +0x7f,0xff,0xf1,0xda,0x01,0x00,0x00,0x06 + +# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_condxchg32_rtn_b64 a[254:255], v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0xfe] +0xff,0xff,0xfc,0xda,0x01,0x02,0x00,0xfe + +# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v255, a[2:3] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0xff,0x02,0x00,0x06] +0xff,0xff,0xfc,0xda,0xff,0x02,0x00,0x06 + +# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[254:255] offset:65535 ; encoding: [0xff,0xff,0xfc,0xda,0x01,0xfe,0x00,0x06] +0xff,0xff,0xfc,0xda,0x01,0xfe,0x00,0x06 
+ +# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] ; encoding: [0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06] +0x00,0x00,0xfc,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:4 ; encoding: [0x04,0x00,0xfc,0xda,0x01,0x02,0x00,0x06] +0x04,0x00,0xfc,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_condxchg32_rtn_b64 a[6:7], v1, a[2:3] offset:65535 gds ; encoding: [0xff,0xff,0xfd,0xda,0x01,0x02,0x00,0x06] +0xff,0xff,0xfd,0xda,0x01,0x02,0x00,0x06 + +# GFX90A: ds_gws_init a1 offset:65535 gds ; encoding: [0xff,0xff,0x33,0xdb,0x01,0x00,0x00,0x00] +0xff,0xff,0x33,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_init a255 offset:65535 gds ; encoding: [0xff,0xff,0x33,0xdb,0xff,0x00,0x00,0x00] +0xff,0xff,0x33,0xdb,0xff,0x00,0x00,0x00 + +# GFX90A: ds_gws_init a1 gds ; encoding: [0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00] +0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_init a1 gds ; encoding: [0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00] +0x00,0x00,0x33,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_init a1 offset:4 gds ; encoding: [0x04,0x00,0x33,0xdb,0x01,0x00,0x00,0x00] +0x04,0x00,0x33,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_sema_br a1 offset:65535 gds ; encoding: [0xff,0xff,0x37,0xdb,0x01,0x00,0x00,0x00] +0xff,0xff,0x37,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_sema_br a255 offset:65535 gds ; encoding: [0xff,0xff,0x37,0xdb,0xff,0x00,0x00,0x00] +0xff,0xff,0x37,0xdb,0xff,0x00,0x00,0x00 + +# GFX90A: ds_gws_sema_br a1 gds ; encoding: [0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00] +0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_sema_br a1 gds ; encoding: [0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00] +0x00,0x00,0x37,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_sema_br a1 offset:4 gds ; encoding: [0x04,0x00,0x37,0xdb,0x01,0x00,0x00,0x00] +0x04,0x00,0x37,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: 
ds_gws_barrier a1 offset:65535 gds ; encoding: [0xff,0xff,0x3b,0xdb,0x01,0x00,0x00,0x00] +0xff,0xff,0x3b,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_barrier a255 offset:65535 gds ; encoding: [0xff,0xff,0x3b,0xdb,0xff,0x00,0x00,0x00] +0xff,0xff,0x3b,0xdb,0xff,0x00,0x00,0x00 + +# GFX90A: ds_gws_barrier a1 gds ; encoding: [0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00] +0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_barrier a1 gds ; encoding: [0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00] +0x00,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_gws_barrier a1 offset:4 gds ; encoding: [0x04,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00] +0x04,0x00,0x3b,0xdb,0x01,0x00,0x00,0x00 + +# GFX90A: ds_consume a5 offset:65535 ; encoding: [0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0x05] +0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_consume a255 offset:65535 ; encoding: [0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0xff] +0xff,0xff,0x7a,0xdb,0x00,0x00,0x00,0xff + +# GFX90A: ds_consume a5 ; encoding: [0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05] +0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_consume a5 ; encoding: [0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05] +0x00,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_consume a5 offset:4 ; encoding: [0x04,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05] +0x04,0x00,0x7a,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_consume a5 offset:65535 gds ; encoding: [0xff,0xff,0x7b,0xdb,0x00,0x00,0x00,0x05] +0xff,0xff,0x7b,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_append a5 offset:65535 ; encoding: [0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0x05] +0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_append a255 offset:65535 ; encoding: [0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0xff] +0xff,0xff,0x7c,0xdb,0x00,0x00,0x00,0xff + +# GFX90A: ds_append a5 ; encoding: [0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05] +0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_append a5 ; encoding: [0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05] +0x00,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_append a5 
offset:4 ; encoding: [0x04,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05] +0x04,0x00,0x7c,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_append a5 offset:65535 gds ; encoding: [0xff,0xff,0x7d,0xdb,0x00,0x00,0x00,0x05] +0xff,0xff,0x7d,0xdb,0x00,0x00,0x00,0x05 + +# GFX90A: ds_ordered_count a5, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0x05] +0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0x05 + +# GFX90A: ds_ordered_count a255, v1 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0xff] +0xff,0xff,0x7f,0xdb,0x01,0x00,0x00,0xff + +# GFX90A: ds_ordered_count a5, v255 offset:65535 gds ; encoding: [0xff,0xff,0x7f,0xdb,0xff,0x00,0x00,0x05] +0xff,0xff,0x7f,0xdb,0xff,0x00,0x00,0x05 + +# GFX90A: ds_ordered_count a5, v1 gds ; encoding: [0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05] +0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05 + +# GFX90A: ds_ordered_count a5, v1 gds ; encoding: [0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05] +0x00,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05 + +# GFX90A: ds_ordered_count a5, v1 offset:4 gds ; encoding: [0x04,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05] +0x04,0x00,0x7f,0xdb,0x01,0x00,0x00,0x05 + +# GFX90A: ds_write_b96 v1, a[2:4] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0x01,0x02,0x00,0x00] +0xff,0xff,0xbc,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b96 v255, a[2:4] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0xff,0x02,0x00,0x00] +0xff,0xff,0xbc,0xdb,0xff,0x02,0x00,0x00 + +# GFX90A: ds_write_b96 v1, a[252:254] offset:65535 ; encoding: [0xff,0xff,0xbc,0xdb,0x01,0xfc,0x00,0x00] +0xff,0xff,0xbc,0xdb,0x01,0xfc,0x00,0x00 + +# GFX90A: ds_write_b96 v1, a[2:4] ; encoding: [0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00] +0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b96 v1, a[2:4] ; encoding: [0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00] +0x00,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b96 v1, a[2:4] offset:4 ; encoding: [0x04,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00] +0x04,0x00,0xbc,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b96 v1, a[2:4] 
offset:65535 gds ; encoding: [0xff,0xff,0xbd,0xdb,0x01,0x02,0x00,0x00] +0xff,0xff,0xbd,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b128 v1, a[2:5] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0x01,0x02,0x00,0x00] +0xff,0xff,0xbe,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b128 v255, a[2:5] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0xff,0x02,0x00,0x00] +0xff,0xff,0xbe,0xdb,0xff,0x02,0x00,0x00 + +# GFX90A: ds_write_b128 v1, a[252:255] offset:65535 ; encoding: [0xff,0xff,0xbe,0xdb,0x01,0xfc,0x00,0x00] +0xff,0xff,0xbe,0xdb,0x01,0xfc,0x00,0x00 + +# GFX90A: ds_write_b128 v1, a[2:5] ; encoding: [0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00] +0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b128 v1, a[2:5] ; encoding: [0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00] +0x00,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b128 v1, a[2:5] offset:4 ; encoding: [0x04,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00] +0x04,0x00,0xbe,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_write_b128 v1, a[2:5] offset:65535 gds ; encoding: [0xff,0xff,0xbf,0xdb,0x01,0x02,0x00,0x00] +0xff,0xff,0xbf,0xdb,0x01,0x02,0x00,0x00 + +# GFX90A: ds_read_b96 a[6:8], v1 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0x06] +0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b96 a[252:254], v1 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0xfc] +0xff,0xff,0xfc,0xdb,0x01,0x00,0x00,0xfc + +# GFX90A: ds_read_b96 a[6:8], v255 offset:65535 ; encoding: [0xff,0xff,0xfc,0xdb,0xff,0x00,0x00,0x06] +0xff,0xff,0xfc,0xdb,0xff,0x00,0x00,0x06 + +# GFX90A: ds_read_b96 a[6:8], v1 ; encoding: [0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06] +0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b96 a[6:8], v1 ; encoding: [0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06] +0x00,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b96 a[6:8], v1 offset:4 ; encoding: [0x04,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06] +0x04,0x00,0xfc,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b96 a[6:8], v1 
offset:65535 gds ; encoding: [0xff,0xff,0xfd,0xdb,0x01,0x00,0x00,0x06] +0xff,0xff,0xfd,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b128 a[6:9], v1 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0x06] +0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b128 a[252:255], v1 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0xfc] +0xff,0xff,0xfe,0xdb,0x01,0x00,0x00,0xfc + +# GFX90A: ds_read_b128 a[6:9], v255 offset:65535 ; encoding: [0xff,0xff,0xfe,0xdb,0xff,0x00,0x00,0x06] +0xff,0xff,0xfe,0xdb,0xff,0x00,0x00,0x06 + +# GFX90A: ds_read_b128 a[6:9], v1 ; encoding: [0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06] +0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b128 a[6:9], v1 ; encoding: [0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06] +0x00,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b128 a[6:9], v1 offset:4 ; encoding: [0x04,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06] +0x04,0x00,0xfe,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: ds_read_b128 a[6:9], v1 offset:65535 gds ; encoding: [0xff,0xff,0xff,0xdb,0x01,0x00,0x00,0x06] +0xff,0xff,0xff,0xdb,0x01,0x00,0x00,0x06 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x00] +0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_load a252, v2, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x01,0x01,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_load a5, v252, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x01,0x01,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_load a5, v2, s[12:19] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x03,0x00] +0x00,0x01,0x01,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_load a5, v2, s[92:99] dmask:0x1 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x17,0x00] +0x00,0x01,0x01,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x2 ; encoding: [0x00,0x02,0x01,0xf0,0x02,0x05,0x02,0x00] +0x00,0x02,0x01,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: 
image_load a6, v2, s[8:15] dmask:0x3 ; encoding: [0x00,0x03,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x03,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x4 ; encoding: [0x00,0x04,0x01,0xf0,0x02,0x05,0x02,0x00] +0x00,0x04,0x01,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_load a6, v2, s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x05,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a6, v2, s[8:15] dmask:0x6 ; encoding: [0x00,0x06,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x06,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a6, v2, s[8:15] dmask:0x7 ; encoding: [0x00,0x07,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x07,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x8 ; encoding: [0x00,0x08,0x01,0xf0,0x02,0x05,0x02,0x00] +0x00,0x08,0x01,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_load a6, v2, s[8:15] dmask:0x9 ; encoding: [0x00,0x09,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x09,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a6, v2, s[8:15] dmask:0xa ; encoding: [0x00,0x0a,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x0a,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a6, v2, s[8:15] dmask:0xb ; encoding: [0x00,0x0b,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x0b,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a6, v2, s[8:15] dmask:0xc ; encoding: [0x00,0x0c,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x0c,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a6, v2, s[8:15] dmask:0xd ; encoding: [0x00,0x0d,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x0d,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a6, v2, s[8:15] dmask:0xe ; encoding: [0x00,0x0e,0x01,0xf0,0x02,0x06,0x02,0x00] +0x00,0x0e,0x01,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_load a5, v2, s[8:15] ; encoding: [0x00,0x00,0x01,0xf0,0x02,0x05,0x02,0x00] +0x00,0x00,0x01,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x01,0xf0,0x02,0x05,0x02,0x00] 
+0x00,0x11,0x01,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 glc ; encoding: [0x00,0x21,0x01,0xf0,0x02,0x05,0x02,0x00] +0x00,0x21,0x01,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 slc ; encoding: [0x00,0x01,0x01,0xf2,0x02,0x05,0x02,0x00] +0x00,0x01,0x01,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 lwe ; encoding: [0x00,0x01,0x03,0xf0,0x02,0x05,0x02,0x00] +0x00,0x01,0x03,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 da ; encoding: [0x00,0x41,0x01,0xf0,0x02,0x05,0x02,0x00] +0x00,0x41,0x01,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_load a5, v2, s[8:15] dmask:0x1 d16 ; encoding: [0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x80] +0x00,0x01,0x01,0xf0,0x02,0x05,0x02,0x80 + +# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x00] +0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: image_store a252, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0xfc,0x03,0x00] +0x00,0x11,0x21,0xf0,0x02,0xfc,0x03,0x00 + +# GFX90A: image_store a1, v252, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0xfc,0x01,0x03,0x00] +0x00,0x11,0x21,0xf0,0xfc,0x01,0x03,0x00 + +# GFX90A: image_store a1, v2, s[16:23] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x04,0x00] +0x00,0x11,0x21,0xf0,0x02,0x01,0x04,0x00 + +# GFX90A: image_store a1, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x17,0x00] +0x00,0x11,0x21,0xf0,0x02,0x01,0x17,0x00 + +# GFX90A: image_store a1, v2, s[12:19] dmask:0x2 unorm ; encoding: [0x00,0x12,0x21,0xf0,0x02,0x01,0x03,0x00] +0x00,0x12,0x21,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x13,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a1, v2, s[12:19] dmask:0x4 unorm ; encoding: [0x00,0x14,0x21,0xf0,0x02,0x01,0x03,0x00] 
+0x00,0x14,0x21,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0x5 unorm ; encoding: [0x00,0x15,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x15,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0x6 unorm ; encoding: [0x00,0x16,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x16,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0x7 unorm ; encoding: [0x00,0x17,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x17,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a1, v2, s[12:19] dmask:0x8 unorm ; encoding: [0x00,0x18,0x21,0xf0,0x02,0x01,0x03,0x00] +0x00,0x18,0x21,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0x9 unorm ; encoding: [0x00,0x19,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x19,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0xa unorm ; encoding: [0x00,0x1a,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x1a,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0xb unorm ; encoding: [0x00,0x1b,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x1b,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0xc unorm ; encoding: [0x00,0x1c,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x1c,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0xd unorm ; encoding: [0x00,0x1d,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x1d,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0xe unorm ; encoding: [0x00,0x1e,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x1e,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a2, v2, s[12:19] dmask:0xf unorm ; encoding: [0x00,0x1f,0x21,0xf0,0x02,0x02,0x03,0x00] +0x00,0x1f,0x21,0xf0,0x02,0x02,0x03,0x00 + +# GFX90A: image_store a1, v2, s[12:19] unorm ; encoding: [0x00,0x10,0x21,0xf0,0x02,0x01,0x03,0x00] +0x00,0x10,0x21,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x21,0xf0,0x02,0x01,0x03,0x00] 
+0x00,0x31,0x21,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x21,0xf2,0x02,0x01,0x03,0x00] +0x00,0x11,0x21,0xf2,0x02,0x01,0x03,0x00 + +# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x23,0xf0,0x02,0x01,0x03,0x00] +0x00,0x11,0x23,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x21,0xf0,0x02,0x01,0x03,0x00] +0x00,0x51,0x21,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: image_store a1, v2, s[12:19] dmask:0x1 unorm d16 ; encoding: [0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x80] +0x00,0x11,0x21,0xf0,0x02,0x01,0x03,0x80 + +# GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x41,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_swap a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x41,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_swap a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x41,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_swap a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x41,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_swap a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x41,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x41,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_swap a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x41,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x41,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x41,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x41,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x41,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x41,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_swap a5, v2, s[8:15] 
dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x43,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x43,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_swap a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x41,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x41,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x45,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_cmpswap a[252:253], v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x13,0x45,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_cmpswap a[6:7], v252, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0xfc,0x06,0x02,0x00] +0x00,0x13,0x45,0xf0,0xfc,0x06,0x02,0x00 + +# GFX90A: image_atomic_cmpswap a[6:7], v2, s[12:19] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x03,0x00] +0x00,0x13,0x45,0xf0,0x02,0x06,0x03,0x00 + +# GFX90A: image_atomic_cmpswap a[6:7], v2, s[92:99] dmask:0x3 unorm ; encoding: [0x00,0x13,0x45,0xf0,0x02,0x06,0x17,0x00] +0x00,0x13,0x45,0xf0,0x02,0x06,0x17,0x00 + +# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0xf unorm ; encoding: [0x00,0x1f,0x45,0xf0,0x02,0x06,0x02,0x00] +0x00,0x1f,0x45,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x45,0xf0,0x02,0x06,0x02,0x00] +0x00,0x33,0x45,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm slc ; encoding: [0x00,0x13,0x45,0xf2,0x02,0x06,0x02,0x00] +0x00,0x13,0x45,0xf2,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm lwe ; encoding: [0x00,0x13,0x47,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x47,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_cmpswap a[6:7], v2, s[8:15] dmask:0x3 unorm da ; encoding: [0x00,0x53,0x45,0xf0,0x02,0x06,0x02,0x00] +0x00,0x53,0x45,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_add a5, v2, s[8:15] 
dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x49,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_add a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x49,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_add a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x49,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_add a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x49,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_add a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x49,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x49,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_add a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x49,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x49,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x49,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x49,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x49,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x49,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x4b,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x4b,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_add a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x49,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x49,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x4d,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_sub a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x4d,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_sub a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0xfc,0x05,0x02,0x00] 
+0x00,0x11,0x4d,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_sub a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x4d,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_sub a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4d,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x4d,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_sub a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x4d,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x4d,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4d,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x4d,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x4d,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x4d,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x4f,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x4f,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_sub a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x4d,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x4d,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x51,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smin a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x51,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_smin a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x51,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_smin a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x51,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_smin a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x51,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x51,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_smin a6, v2, s[8:15] 
dmask:0x3 unorm ; encoding: [0x00,0x13,0x51,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x51,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x51,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x51,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x51,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x51,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x53,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x53,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smin a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x51,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x51,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x55,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umin a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x55,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_umin a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x55,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_umin a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x55,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_umin a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x55,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x55,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_umin a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x55,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x55,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x55,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x55,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x55,0xf2,0x02,0x05,0x02,0x00] 
+0x00,0x11,0x55,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x57,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x57,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umin a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x55,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x55,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x59,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smax a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x59,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_smax a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x59,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_smax a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x59,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_smax a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x59,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x59,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_smax a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x59,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x59,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x59,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x59,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x59,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x59,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x5b,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x5b,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_smax a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x59,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x59,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umax a5, 
v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x5d,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umax a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x5d,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_umax a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x5d,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_umax a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x5d,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_umax a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x5d,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x5d,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_umax a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x5d,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x5d,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x5d,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x5d,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x5d,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x5d,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x5f,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x5f,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_umax a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x5d,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x5d,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x61,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_and a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x61,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_and a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0xfc,0x05,0x02,0x00] 
+0x00,0x11,0x61,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_and a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x61,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_and a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x61,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x61,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_and a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x61,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x61,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x61,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x61,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x61,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x61,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x63,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x63,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_and a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x61,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x61,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x65,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_or a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x65,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_or a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x65,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_or a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x65,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_or a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x65,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x65,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_or a6, v2, s[8:15] dmask:0x3 
unorm ; encoding: [0x00,0x13,0x65,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x65,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x65,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x65,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x65,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x65,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x67,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x67,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_or a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x65,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x65,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x69,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_xor a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x69,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_xor a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x69,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_xor a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x69,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_xor a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x69,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x69,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_xor a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x69,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x69,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x69,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x69,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x69,0xf2,0x02,0x05,0x02,0x00] 
+0x00,0x11,0x69,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x6b,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x6b,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_xor a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x69,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x69,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x6d,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_inc a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x6d,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_inc a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x6d,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_inc a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x6d,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_inc a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6d,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x6d,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_inc a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x6d,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x6d,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x6d,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x6d,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x6d,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x6d,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x6f,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x6f,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_inc a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x6d,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x6d,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_dec a5, v2, s[8:15] 
dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x71,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_dec a252, v2, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0xfc,0x02,0x00] +0x00,0x11,0x71,0xf0,0x02,0xfc,0x02,0x00 + +# GFX90A: image_atomic_dec a5, v252, s[8:15] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0xfc,0x05,0x02,0x00] +0x00,0x11,0x71,0xf0,0xfc,0x05,0x02,0x00 + +# GFX90A: image_atomic_dec a5, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x03,0x00] +0x00,0x11,0x71,0xf0,0x02,0x05,0x03,0x00 + +# GFX90A: image_atomic_dec a5, v2, s[92:99] dmask:0x1 unorm ; encoding: [0x00,0x11,0x71,0xf0,0x02,0x05,0x17,0x00] +0x00,0x11,0x71,0xf0,0x02,0x05,0x17,0x00 + +# GFX90A: image_atomic_dec a6, v2, s[8:15] dmask:0x3 unorm ; encoding: [0x00,0x13,0x71,0xf0,0x02,0x06,0x02,0x00] +0x00,0x13,0x71,0xf0,0x02,0x06,0x02,0x00 + +# GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x71,0xf0,0x02,0x05,0x02,0x00] +0x00,0x31,0x71,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm slc ; encoding: [0x00,0x11,0x71,0xf2,0x02,0x05,0x02,0x00] +0x00,0x11,0x71,0xf2,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm lwe ; encoding: [0x00,0x11,0x73,0xf0,0x02,0x05,0x02,0x00] +0x00,0x11,0x73,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_atomic_dec a5, v2, s[8:15] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x71,0xf0,0x02,0x05,0x02,0x00] +0x00,0x51,0x71,0xf0,0x02,0x05,0x02,0x00 + +# GFX90A: image_sample a5, v0, s[8:15], s[12:15] dmask:0x1 ; encoding: [0x00,0x01,0x81,0xf0,0x00,0x05,0x62,0x00] +0x00,0x01,0x81,0xf0,0x00,0x05,0x62,0x00 Index: llvm/test/MC/Disassembler/AMDGPU/mai-gfx90a.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/AMDGPU/mai-gfx90a.txt @@ -0,0 +1,2512 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -show-encoding -disassemble %s | FileCheck 
-check-prefix=GFX90A %s + +# GFX90A: v_accvgpr_read_b32 v2, a0 ; encoding: [0x02,0x40,0xd8,0xd3,0x00,0x01,0x00,0x18] +0x02,0x40,0xd8,0xd3,0x00,0x01,0x00,0x18 + +# GFX90A: v_accvgpr_read_b32 v2, a1 ; encoding: [0x02,0x40,0xd8,0xd3,0x01,0x01,0x00,0x18] +0x02,0x40,0xd8,0xd3,0x01,0x01,0x00,0x18 + +# GFX90A: v_accvgpr_read_b32 v2, a255 ; encoding: [0x02,0x40,0xd8,0xd3,0xff,0x01,0x00,0x18] +0x02,0x40,0xd8,0xd3,0xff,0x01,0x00,0x18 + +# GFX90A: v_accvgpr_write_b32 a2, -2.0 ; encoding: [0x02,0x40,0xd9,0xd3,0xf5,0x00,0x00,0x18] +0x02,0x40,0xd9,0xd3,0xf5,0x00,0x00,0x18 + +# GFX90A: v_accvgpr_write_b32 a2, -2 ; encoding: [0x02,0x40,0xd9,0xd3,0xc2,0x00,0x00,0x18] +0x02,0x40,0xd9,0xd3,0xc2,0x00,0x00,0x18 + +# GFX90A: v_accvgpr_write_b32 a2, v1 ; encoding: [0x02,0x40,0xd9,0xd3,0x01,0x01,0x00,0x18] +0x02,0x40,0xd9,0xd3,0x01,0x01,0x00,0x18 + +# GFX90A: v_accvgpr_mov_b32 a1, a2 ; encoding: [0x02,0xa5,0x02,0x7e] +0x02,0xa5,0x02,0x7e + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] ; encoding: 
[0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xc0,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xc0,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xc0,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xc0,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x80,0xc0,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xc0,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x1f32 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xfb] 
+0x00,0x93,0xc0,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x1f32 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x13,0xc0,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xc1,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xc1,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x14] 
+0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xc1,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xc1,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xf3] 
+0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x80,0xc1,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xc1,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x1f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x93,0xc1,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x1f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x13,0xc1,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x14 + +# 
GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xc2,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xc2,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xc2,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, v[2:5] cbsz:3 
abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xc2,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xeb] 
+0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x80,0xc2,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xc2,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_4x4x1f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x93,0xc2,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_4x4x1f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x13,0xc2,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xc4,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xfc] 
+0x00,0x93,0xc4,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xc4,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xc4,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xe3] 
+0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x80,0xc4,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xc4,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x2f32 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x93,0xc4,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x2f32 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x13,0xc4,0xd3,0x00,0x03,0xd6,0xfb + +# 
GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xc5,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xc5,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], 
a0, v1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xc5,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xc5,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 ; encoding: 
[0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x80,0xc5,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xc5,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x4f32 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x93,0xc5,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x4f32 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x13,0xc5,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] ; encoding: 
[0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xc8,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xc8,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xc8,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xfc] 
+0x00,0x13,0xc8,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; 
encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xc8,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xc8,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x4f16 a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xc8,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x4f16 v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xc8,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xc9,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: 
v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xc9,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xc9,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x13,0xc9,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xc9,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xc9,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x4f16 a[0:15], a[0:1], 
a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xc9,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x4f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xc9,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xca,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xca,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xe4] 
+0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xca,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x13,0xca,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 ; encoding: 
[0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xca,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xca,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_4x4x4f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xca,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_4x4x4f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xca,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], 
v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xcc,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xcc,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x0c] 
+0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xcc,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x13,0xcc,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], 
-2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xcc,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xcc,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x8f16 a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xcc,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x8f16 v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xcc,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xf4 + +# 
GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xcd,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xcd,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xcd,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x13,0xcd,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], 
v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xcd,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xcd,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x16f16 a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xcd,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x16f16 v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xcd,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, 
a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x02] +0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x02] +0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf2] +0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf2] +0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0a] +0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0a] +0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xea] +0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xea] +0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 ; encoding: [0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x80,0xd0,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 ; encoding: [0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x00,0xd0,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_32x32x4i8 a[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x93,0xd0,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: v_mfma_i32_32x32x4i8 v[0:31], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x13,0xd0,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: 
v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_16x16x4i8 
v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x02] +0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x02] +0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf2] +0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf2] +0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0a] 
+0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0a] +0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xea] +0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xea] +0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 ; encoding: [0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x80,0xd1,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 ; encoding: [0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x00,0xd1,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_16x16x4i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x93,0xd1,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: v_mfma_i32_16x16x4i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x13,0xd1,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, 
v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x02] +0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x02] 
+0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf2] +0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf2] +0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0a] +0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0a] +0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xea] +0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xea] +0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 ; encoding: [0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x80,0xd2,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 ; encoding: [0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x00,0xd2,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_4x4x4i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x93,0xd2,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: v_mfma_i32_4x4x4i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x13,0xd2,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] ; encoding: 
[0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x02] +0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x02] +0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf2] 
+0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf2] +0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0a] +0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0a] +0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xea] +0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xea] +0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x80,0xd4,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x00,0xd4,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_32x32x8i8 a[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x93,0xd4,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: v_mfma_i32_32x32x8i8 v[0:15], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x13,0xd4,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_16x16x16i8 
a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 
blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x02] +0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x02] +0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x02 + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe2] +0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xe2 + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x12] +0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x12 + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf2] +0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], v0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf2] +0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xf2 + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0a] +0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0a] +0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x0a + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xea] +0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, v1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xea] 
+0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xea + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x80,0xd5,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1a] +0x00,0x00,0xd5,0xd3,0x00,0x03,0x0a,0x1a + +# GFX90A: v_mfma_i32_16x16x16i8 a[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x93,0xd5,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: v_mfma_i32_16x16x16i8 v[0:3], a0, a1, 2 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfa] +0x00,0x13,0xd5,0xd3,0x00,0x03,0x0a,0xfa + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xe8,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xfc] 
+0x00,0x93,0xe8,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xe8,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xe8,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xe3] 
+0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 ; encoding: [0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x80,0xe8,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 ; encoding: [0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xe8,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x2bf16 a[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x93,0xe8,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x2bf16 v[0:31], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xfb] 
+0x00,0x13,0xe8,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xe9,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xe9,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xf4] 
+0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xe9,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xe9,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xf3] 
+0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x80,0xe9,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xe9,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x2bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x93,0xe9,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x2bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x13,0xe9,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xf4 
+ +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xeb,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xeb,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xeb,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xeb,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, 
-2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x1b] 
+0x00,0x80,0xeb,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xeb,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_4x4x2bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x93,0xeb,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_4x4x2bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x13,0xeb,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xec,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xec,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x04] 
+0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xec,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xec,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x13] 
+0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 ; encoding: [0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x80,0xec,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 ; encoding: [0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xec,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x4bf16 a[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x93,0xec,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x4bf16 v[0:15], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x13,0xec,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x04] 
+0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x0c] +0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x80,0xed,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x93,0xed,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x04] +0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xe4] +0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x14] +0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xf4] +0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x0c] 
+0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xec] +0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x1c] +0x00,0x00,0xed,0xd3,0x00,0x03,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xfc] +0x00,0x13,0xed,0xd3,0x00,0x03,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x03] +0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xe3] +0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x13] +0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], v0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xf3] +0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x0b + 
+# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x0b] +0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, v1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xeb] +0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 ; encoding: [0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x80,0xed,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 ; encoding: [0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x1b] +0x00,0x00,0xed,0xd3,0x00,0x03,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x8bf16 a[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x93,0xed,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x8bf16 v[0:3], a0, a1, -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xfb] +0x00,0x13,0xed,0xd3,0x00,0x03,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x0c] 
+0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xe3,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], a[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xe3,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xe3,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], v[2:33] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xfc] 
+0x00,0x13,0xe3,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], 
a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xe3,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xe3,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x4bf16_1k a[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xe3,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x4bf16_1k v[0:31], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xe3,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: 
[0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xe4,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xe4,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xe4,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x13,0xe4,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x03] 
+0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xe4,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: 
v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xe4,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x4bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xe4,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x4bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xe4,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xe5,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xe5,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: 
[0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xe5,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x13,0xe5,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xe3 + +# 
GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xe5,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xe5,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_4x4x4bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xe5,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_4x4x4bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xe5,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xe6,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], a[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xe6,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x14] 
+0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xe6,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], v[2:17] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x13,0xe6,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: 
v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xe6,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xe6,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_32x32x8bf16_1k a[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xe6,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_32x32x8bf16_1k v[0:15], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xe6,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[2:5] cbsz:3 
abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x80,0xe7,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], a[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x93,0xe7,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x14] +0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x14 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x0c] 
+0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xec] +0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xec + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x1c] +0x00,0x00,0xe7,0xd3,0x00,0x05,0x0a,0x1c + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], v[2:5] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xfc] +0x00,0x13,0xe7,0xd3,0x00,0x05,0x0a,0xfc + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x13] +0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x13 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], v[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xf3] +0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xf3 + +# GFX90A: 
v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x0b] +0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x0b + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xeb] +0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xeb + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xe7,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xe7,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f32_16x16x16bf16_1k a[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x93,0xe7,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f32_16x16x16bf16_1k v[0:3], a[0:1], a[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xfb] +0x00,0x13,0xe7,0xd3,0x00,0x05,0xd6,0xfb + +# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: 
[0x00,0x13,0xee,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xee,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x13,0xef,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], a[0:1], v[2:3], v[2:9] ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xee,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], v[0:1], a[2:3], v[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xee,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f64_16x16x4f64 v[0:7], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xee,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], a[0:1], v[2:3], v[2:3] ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x00,0xef,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], a[2:3], v[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x13,0xef,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f64_4x4x4f64 v[0:1], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x00,0xef,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[2:9] cbsz:3 abid:2 
blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xee,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x04] +0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x04 + +# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[2:3] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xe4] +0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xe4 + +# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x03] +0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x03 + +# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x02,0x02] +0x00,0x80,0xef,0xd3,0x00,0x05,0x02,0x02 + +# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], -2.0 cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0xd6,0xe3] +0x00,0x93,0xef,0xd3,0x00,0x05,0xd6,0xe3 + +# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], v[2:3], a[2:9] ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xee,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], a[2:3], a[2:9] cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xee,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xee,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], a[0:1], v[2:3], a[2:3] ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x0c] +0x00,0x80,0xef,0xd3,0x00,0x05,0x0a,0x0c + +# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], a[2:3], a[2:3] 
cbsz:3 abid:2 blgp:7 ; encoding: [0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xf4] +0x00,0x93,0xef,0xd3,0x00,0x05,0x0a,0xf4 + +# GFX90A: v_mfma_f64_4x4x4f64 a[0:1], a[0:1], a[2:3], -2.0 ; encoding: [0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x1b] +0x00,0x80,0xef,0xd3,0x00,0x05,0xd6,0x1b + +# GFX90A: v_mfma_f64_16x16x4f64 a[0:7], a[0:1], a[2:3], 0 ; encoding: [0x00,0x80,0xee,0xd3,0x00,0x05,0x02,0x1a] +0x00,0x80,0xee,0xd3,0x00,0x05,0x02,0x1a Index: llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/AMDGPU/mimg_gfx90a.txt @@ -0,0 +1,76 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx90a -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX90A + +# GFX90A: image_load v4, v238, s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x00,0xf0,0xee,0x04,0x07,0x00] +0x00,0x17,0x00,0xf0,0xee,0x04,0x07,0x00 + +# GFX90A: image_load_pck v5, v0, s[8:15] dmask:0x1 glc ; encoding: [0x00,0x21,0x08,0xf0,0x00,0x05,0x02,0x00] +0x00,0x21,0x08,0xf0,0x00,0x05,0x02,0x00 + +# GFX90A: image_load_pck_sgn v5, v0, s[8:15] dmask:0x1 lwe ; encoding: [0x00,0x01,0x0e,0xf0,0x00,0x05,0x02,0x00] +0x00,0x01,0x0e,0xf0,0x00,0x05,0x02,0x00 + +# GFX90A: image_load_mip v5, v0, s[8:15] ; encoding: [0x00,0x00,0x04,0xf0,0x00,0x05,0x02,0x00] +0x00,0x00,0x04,0xf0,0x00,0x05,0x02,0x00 + +# GFX90A: image_load_mip_pck v5, v1, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x10,0xf0,0x01,0x05,0x02,0x00] +0x00,0x01,0x10,0xf0,0x01,0x05,0x02,0x00 + +# GFX90A: image_load_mip_pck_sgn v4, v0, s[8:15] dmask:0x5 ; encoding: [0x00,0x05,0x14,0xf0,0x00,0x04,0x02,0x00] +0x00,0x05,0x14,0xf0,0x00,0x04,0x02,0x00 + +# GFX90A: image_store v192, v238, s[28:35] dmask:0x7 unorm ; encoding: [0x00,0x17,0x20,0xf0,0xee,0xc0,0x07,0x00] +0x00,0x17,0x20,0xf0,0xee,0xc0,0x07,0x00 + +# GFX90A: image_store_pck v1, v2, s[12:19] dmask:0x1 unorm da ; encoding: [0x00,0x51,0x28,0xf0,0x02,0x01,0x03,0x00] +0x00,0x51,0x28,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: 
image_store_mip v1, v2, s[12:19] ; encoding: [0x00,0x00,0x24,0xf0,0x02,0x01,0x03,0x00] +0x00,0x00,0x24,0xf0,0x02,0x01,0x03,0x00 + +# GFX90A: image_store_mip_pck v252, v2, s[12:19] dmask:0x1 a16 ; encoding: [0x00,0x81,0x2c,0xf0,0x02,0xfc,0x03,0x00] +0x00,0x81,0x2c,0xf0,0x02,0xfc,0x03,0x00 + +# GFX90A: image_atomic_add v4, v192, s[28:35] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x31,0x48,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_and v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x60,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x60,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_swap v4, v192, s[28:35] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x31,0x40,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_cmpswap v[4:5], v192, s[28:35] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x33,0x44,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_or v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x64,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x64,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_xor v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x68,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x68,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_sub v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x4c,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x4c,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_smin v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x50,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x50,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_smax v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x58,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x58,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_umin v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x54,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x54,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_umax v4, v192, s[28:35] dmask:0x1 unorm ; encoding: 
[0x00,0x11,0x5c,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x5c,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_inc v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x6c,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x6c,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_atomic_dec v4, v192, s[28:35] dmask:0x1 unorm ; encoding: [0x00,0x11,0x70,0xf0,0xc0,0x04,0x07,0x00] +0x00,0x11,0x70,0xf0,0xc0,0x04,0x07,0x00 + +# GFX90A: image_get_resinfo v5, v1, s[8:15] dmask:0x1 ; encoding: [0x00,0x01,0x38,0xf0,0x01,0x05,0x02,0x00] +0x00,0x01,0x38,0xf0,0x01,0x05,0x02,0x00 + +0x00,0x01,0x80,0xf0,0x00,0x05,0x62,0x00 +# GFX90A: image_sample v5, v0, s[8:15], s[12:15] dmask:0x1 ; encoding: [0x00,0x01,0x80,0xf0,0x00,0x05,0x62,0x00] Index: llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml =================================================================== --- llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml +++ llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml @@ -142,6 +142,10 @@ # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX909 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX909 %s # RUN: obj2yaml %t.o.AMDGCN_GFX909 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX909 %s +# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX90A/' %s | yaml2obj -o %t.o.AMDGCN_GFX90A +# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX90A | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX90A %s +# RUN: obj2yaml %t.o.AMDGCN_GFX90A | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX90A %s + # RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX90C/' %s | yaml2obj -o %t.o.AMDGCN_GFX90C # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX90C | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX90C %s # RUN: obj2yaml %t.o.AMDGCN_GFX90C | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX90C %s @@ -300,6 +304,9 @@ # ELF-AMDGCN-GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31) # YAML-AMDGCN-GFX909: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX909 ] +# ELF-AMDGCN-GFX90A: EF_AMDGPU_MACH_AMDGCN_GFX90A
(0x3F) +# YAML-AMDGCN-GFX90A: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX90A ] + # ELF-AMDGCN-GFX90C: EF_AMDGPU_MACH_AMDGCN_GFX90C (0x32) # YAML-AMDGCN-GFX90C: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX90C ] Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f32.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f32.ll @@ -0,0 +1,66 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -slp-vectorizer < %s | FileCheck -check-prefixes=GCN,GFX90A %s + +; GCN-LABEL: @fadd_combine +; GFX908: fadd float +; GFX908: fadd float +; GFX90A: fadd <2 x float> +define amdgpu_kernel void @fadd_combine(float addrspace(1)* %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp1 + %tmp3 = load float, float addrspace(1)* %tmp2, align 4 + %tmp4 = fadd float %tmp3, 1.000000e+00 + store float %tmp4, float addrspace(1)* %tmp2, align 4 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp5 + %tmp7 = load float, float addrspace(1)* %tmp6, align 4 + %tmp8 = fadd float %tmp7, 1.000000e+00 + store float %tmp8, float addrspace(1)* %tmp6, align 4 + ret void +} + +; GCN-LABEL: @fmul_combine +; GFX908: fmul float +; GFX908: fmul float +; GFX90A: fmul <2 x float> +define amdgpu_kernel void @fmul_combine(float addrspace(1)* %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp1 + %tmp3 = load float, float addrspace(1)* %tmp2, align 4 + %tmp4 = fmul float %tmp3, 1.000000e+00 + store float %tmp4, float addrspace(1)* %tmp2, align 4 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds float, float addrspace(1)* 
%arg, i64 %tmp5 + %tmp7 = load float, float addrspace(1)* %tmp6, align 4 + %tmp8 = fmul float %tmp7, 1.000000e+00 + store float %tmp8, float addrspace(1)* %tmp6, align 4 + ret void +} + +; GCN-LABEL: @fma_combine +; GFX908: call float @llvm.fma.f32 +; GFX908: call float @llvm.fma.f32 +; GFX90A: call <2 x float> @llvm.fma.v2f32 +define amdgpu_kernel void @fma_combine(float addrspace(1)* %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = zext i32 %tmp to i64 + %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp1 + %tmp3 = load float, float addrspace(1)* %tmp2, align 4 + %tmp4 = tail call float @llvm.fma.f32(float %tmp3, float 1.000000e+00, float 1.000000e+00) + store float %tmp4, float addrspace(1)* %tmp2, align 4 + %tmp5 = add nuw nsw i64 %tmp1, 1 + %tmp6 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp5 + %tmp7 = load float, float addrspace(1)* %tmp6, align 4 + %tmp8 = tail call float @llvm.fma.f32(float %tmp7, float 1.000000e+00, float 1.000000e+00) + store float %tmp8, float addrspace(1)* %tmp6, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() +declare float @llvm.fma.f32(float, float, float) + Index: llvm/test/tools/llvm-readobj/ELF/amdgpu-elf-headers.test =================================================================== --- llvm/test/tools/llvm-readobj/ELF/amdgpu-elf-headers.test +++ llvm/test/tools/llvm-readobj/ELF/amdgpu-elf-headers.test @@ -58,6 +58,9 @@ # RUN: yaml2obj %s -o %t -DCPU=GFX909 # RUN: llvm-readobj -h %t | FileCheck %s --match-full-lines -DFILE=%t -DCPU=GFX909 -DFLAGS=0x31 +# RUN: yaml2obj %s -o %t -DCPU=GFX90A +# RUN: llvm-readobj -h %t | FileCheck %s --match-full-lines -DFILE=%t -DCPU=GFX90A -DFLAGS=0x3F + # RUN: yaml2obj %s -o %t -DCPU=GFX90C # RUN: llvm-readobj -h %t | FileCheck %s --match-full-lines -DFILE=%t -DCPU=GFX90C -DFLAGS=0x32 Index: llvm/tools/llvm-readobj/ELFDumper.cpp =================================================================== --- 
llvm/tools/llvm-readobj/ELFDumper.cpp +++ llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1466,6 +1466,7 @@ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX906), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX908), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX909), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX90A), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX90C), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1010), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1011),