diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -222,12 +222,17 @@ //===----------------------------------------------------------------------===// TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot7-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot8-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", "dot8-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot8-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts") TARGET_BUILTIN(__builtin_amdgcn_udot4, "UiUiUiUiIb", "nc", "dot7-insts") +TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", "nc", "dot8-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts") TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts") +TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts") //===----------------------------------------------------------------------===// // GFX10+ only builtins. diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl @@ -8,29 +8,45 @@ typedef short __attribute__((ext_vector_type(2))) short2; typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; +#pragma OPENCL EXTENSION cl_khr_fp16 : enable kernel void builtins_amdgcn_dl_insts_err( global float *fOut, global int *siOut, global uint *uiOut, - half2 v2hA, half2 v2hB, float fC, - short2 v2ssA, short2 v2ssB, int siA, int siB, int siC, - ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC) { - fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}} - fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}} + global short *sOut, global int *iOut, global half *hOut, + half2 v2hA, half2 v2hB, float fC, half hC, + short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC, + ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC, + int A, int B, int C) { + fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}} + fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2' needs target feature dot7-insts}} - siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}} - siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}} + hOut[0] = __builtin_amdgcn_fdot2_f16_f16(v2hA, v2hB, hC); // expected-error {{'__builtin_amdgcn_fdot2_f16_f16' needs target feature dot8-insts}} - uiOut[0] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, false); // expected-error {{'__builtin_amdgcn_udot2' needs target feature dot2-insts}} - uiOut[1] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, true); // expected-error {{'__builtin_amdgcn_udot2' needs target feature dot2-insts}} + sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC); // expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature dot8-insts}} - siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}} - siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}} + fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot8-insts}} + fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot8-insts}} - uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}} - uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}} + siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}} + siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}} - siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}} - siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}} + uiOut[0] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, false); // expected-error {{'__builtin_amdgcn_udot2' needs target feature dot2-insts}} + uiOut[1] = __builtin_amdgcn_udot2(v2usA, v2usB, uiC, true); // expected-error {{'__builtin_amdgcn_udot2' needs target feature dot2-insts}} - uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}} - uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}} + siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}} + siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot4' needs target feature dot1-insts}} + + uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}} + uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot4' needs target feature dot7-insts}} + + iOut[0] = __builtin_amdgcn_sudot4(true, A, false, B, C, false); // expected-error {{'__builtin_amdgcn_sudot4' needs target feature dot8-insts}} + iOut[1] = __builtin_amdgcn_sudot4(false, A, true, B, C, true); // expected-error {{'__builtin_amdgcn_sudot4' needs target feature dot8-insts}} + + siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}} + siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true); // expected-error {{'__builtin_amdgcn_sdot8' needs target feature dot1-insts}} + + uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}} + uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); // expected-error {{'__builtin_amdgcn_udot8' needs target feature dot7-insts}} + + iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}} + iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); // expected-error {{'__builtin_amdgcn_sudot8' needs target feature dot8-insts}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl @@ -0,0 +1,64 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -emit-llvm -o - %s | FileCheck %s + +typedef unsigned int uint; +typedef half __attribute__((ext_vector_type(2))) half2; +typedef short __attribute__((ext_vector_type(2))) short2; +typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; + +// CHECK-LABEL: @builtins_amdgcn_dl_insts +// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 false) +// CHECK: call float @llvm.amdgcn.fdot2(<2 x half> %v2hA, <2 x half> %v2hB, float %fC, i1 true) +// CHECK: call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %v2hA, <2 x half> %v2hB, half %hC) +// CHECK: call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, i16 %sC) +// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 false) +// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> %v2ssB, float %fC, i1 true) +// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 false) +// CHECK: call i32 @llvm.amdgcn.sdot4(i32 %siA, i32 %siB, i32 %siC, i1 true) +// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) +// CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true) +// CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 false) +// CHECK: call i32 @llvm.amdgcn.sudot4(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 true) +// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 false) +// CHECK: call i32 @llvm.amdgcn.sdot8(i32 %siA, i32 %siB, i32 %siC, i1 true) +// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 false) +// CHECK: call i32 @llvm.amdgcn.udot8(i32 %uiA, i32 %uiB, i32 %uiC, i1 true) +// CHECK: call i32 @llvm.amdgcn.sudot8(i1 false, i32 %A, i1 true, i32 %B, i32 %C, i1 false) +// CHECK: call i32 @llvm.amdgcn.sudot8(i1 true, i32 %A, i1 false, i32 %B, i32 %C, i1 true) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +kernel void builtins_amdgcn_dl_insts_err( + global float *fOut, global int *siOut, global uint *uiOut, + global short *sOut, global int *iOut, global half *hOut, + half2 v2hA, half2 v2hB, float fC, half hC, + short2 v2ssA, short2 v2ssB, short sC, int siA, int siB, int siC, + ushort2 v2usA, ushort2 v2usB, uint uiA, uint uiB, uint uiC, + int A, int B, int C) { + fOut[0] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, false); + fOut[1] = __builtin_amdgcn_fdot2(v2hA, v2hB, fC, true); + + hOut[0] = __builtin_amdgcn_fdot2_f16_f16(v2hA, v2hB, hC); + + sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC); + + fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); + fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); + + siOut[2] = __builtin_amdgcn_sdot4(siA, siB, siC, false); + siOut[3] = __builtin_amdgcn_sdot4(siA, siB, siC, true); + + uiOut[2] = __builtin_amdgcn_udot4(uiA, uiB, uiC, false); + uiOut[3] = __builtin_amdgcn_udot4(uiA, uiB, uiC, true); + + iOut[0] = __builtin_amdgcn_sudot4(true, A, false, B, C, false); + iOut[1] = __builtin_amdgcn_sudot4(false, A, true, B, C, true); + + siOut[4] = __builtin_amdgcn_sdot8(siA, siB, siC, false); + siOut[5] = __builtin_amdgcn_sdot8(siA, siB, siC, true); + + uiOut[4] = __builtin_amdgcn_udot8(uiA, uiB, uiC, false); + uiOut[5] = __builtin_amdgcn_udot8(uiA, uiB, uiC, true); + + iOut[3] = __builtin_amdgcn_sudot8(false, A, true, B, C, false); + iOut[4] = __builtin_amdgcn_sudot8(true, A, false, B, C, true); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1923,6 +1923,49 @@ [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>] >; +// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c +def int_amdgcn_fdot2_f16_f16 : + GCCBuiltin<"__builtin_amdgcn_fdot2_f16_f16">, + Intrinsic< + [llvm_half_ty], // %r + [ + llvm_v2f16_ty, // %a + llvm_v2f16_ty, // %b + llvm_half_ty // %c + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn] + >; + +// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c +def int_amdgcn_fdot2_bf16_bf16 : + GCCBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, + Intrinsic< + [llvm_i16_ty], // %r + [ + llvm_v2i16_ty, // %a + llvm_v2i16_ty, // %b + llvm_i16_ty // %c + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn] + >; + +// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c +def int_amdgcn_fdot2_f32_bf16 : + GCCBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">, + Intrinsic< + [llvm_float_ty], // %r + [ + llvm_v2i16_ty, // %a + llvm_v2i16_ty, // %b + llvm_float_ty, // %c + llvm_i1_ty // %clamp + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>] + >; + // i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c def int_amdgcn_sdot2 : @@ -1983,6 +2026,27 @@ [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>] >; +// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp) +// Treat input as signed (_sign = 1) or unsigned (_sign = 0). +// a[i in 0. . . 3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i])); +// b[i in 0. . . 3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i])); +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c +def int_amdgcn_sudot4 : + GCCBuiltin<"__builtin_amdgcn_sudot4">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_i1_ty, // %a_sign + llvm_i32_ty, // %a + llvm_i1_ty, // %b_sign + llvm_i32_ty, // %b + llvm_i32_ty, // %c + llvm_i1_ty // %clamp + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, + ImmArg>, ImmArg>, ImmArg>] + >; + // i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c @@ -2015,6 +2079,28 @@ [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>] >; +// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp) +// Treat input as signed (_sign = 1) or unsigned (_sign = 0). +// a[i in 0. . . 7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i])); +// b[i in 0. . . 7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i])); +// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + +// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c + def int_amdgcn_sudot8 : + GCCBuiltin<"__builtin_amdgcn_sudot8">, + Intrinsic< + [llvm_i32_ty], // %r + [ + llvm_i1_ty, // %a_sign + llvm_i32_ty, // %a + llvm_i1_ty, // %b_sign + llvm_i32_ty, // %b + llvm_i32_ty, // %c + llvm_i1_ty // %clamp + ], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, + ImmArg>, ImmArg>, ImmArg>] + >; + //===----------------------------------------------------------------------===// // gfx908 intrinsics // ===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -51,6 +51,10 @@ GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_dotiuvop3pmods : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -225,6 +225,8 @@ bool IsDOT = false) const; bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2741,6 +2741,21 @@ return SelectVOP3PMods(In, Src, SrcMods, true); } +bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast(In); + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // 1 promotes packed values to signed, 0 treats them as unsigned. + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcSign = C->getAPIntValue().getZExtValue(); + if (SrcSign == 1) + Mods ^= SISrcMods::NEG; + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -185,6 +185,9 @@ InstructionSelector::ComplexRendererFns selectVOP3PModsDOT(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectDotIUVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3664,6 +3664,21 @@ }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const { + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // Value is in Imm operand as i1 sign extended to int64_t. + // 1(-1) promotes packed values to signed, 0 treats them as unsigned. + assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() == -1) + Mods ^= SISrcMods::NEG; + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4249,6 +4249,11 @@ case Intrinsic::amdgcn_udot4: case Intrinsic::amdgcn_sdot8: case Intrinsic::amdgcn_udot8: + case Intrinsic::amdgcn_fdot2_bf16_bf16: + case Intrinsic::amdgcn_fdot2_f16_f16: + case Intrinsic::amdgcn_fdot2_f32_bf16: + case Intrinsic::amdgcn_sudot4: + case Intrinsic::amdgcn_sudot8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_sbfe: case Intrinsic::amdgcn_ubfe: diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -724,8 +724,8 @@ } // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = HasDot8Insts in { - defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile>; - defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile>; + defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile, int_amdgcn_fdot2_f16_f16>; + defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile, int_amdgcn_fdot2_bf16_bf16>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -359,7 +359,7 @@ defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", VOP3P_Profile, - null_frag, 1>; + int_amdgcn_fdot2_f32_bf16, 1>; } // End SubtargetPredicate = HasDot8Insts @@ -381,8 +381,8 @@ } let SubtargetPredicate = HasDot8Insts in { -defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", null_frag>; -defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", null_frag>; +defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>; +defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>; } // End SubtargetPredicate = HasDot8Insts def : UDot2Pat; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX906 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10PLUS %s define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) { ; GFX906-LABEL: v_fdot2: @@ -10,12 +11,12 @@ ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdot2: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_fdot2: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false) ret float %r } @@ -27,12 +28,12 @@ ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdot2_clamp: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_fdot2_clamp: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true) ret float %r } @@ -44,12 +45,12 @@ ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdot2_neg_a: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_fdot2_neg_a: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %a %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false) ret float %r @@ -62,12 +63,12 @@ ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdot2_neg_b: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_fdot2_neg_b: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false) ret float %r @@ -80,12 +81,12 @@ ; GFX906-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdot2_neg_a_neg_b: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_fdot2_neg_a_neg_b: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.a = fneg <2 x half> %b %neg.b = fneg <2 x half> %b %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %neg.b, float %c, i1 false) @@ -100,13 +101,13 @@ ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdot2_neg_c: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_fdot2_neg_c: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.c = fneg float %c %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false) ret float %r @@ -119,12 +120,12 @@ ; GFX906-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdot2_inline_literal_a: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_fdot2_inline_literal_a: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> , <2 x half> %b, float %c, i1 false) ret float %ret } @@ -136,12 +137,12 @@ ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdot2_inline_literal_b: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_fdot2_inline_literal_b: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> , float %c, i1 false) ret float %ret } @@ -153,12 +154,12 @@ ; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdot2_inline_literal_c: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_fdot2_inline_literal_c: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false) ret float %ret } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot4.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 + +declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp) + +define i32 @test_llvm_amdgcn_sudot4_uu(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x1c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_us(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_us: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x5c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_su(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_su: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x3c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_ss(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x7c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + + + +define i32 @test_llvm_amdgcn_sudot4_uu_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x1c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_us_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_us_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x5c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_su_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_su_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x3c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_ss_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x7c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1) + ret i32 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sudot8.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 + +declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp) + +define i32 @test_llvm_amdgcn_sudot8_uu(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x1c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_us(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_us: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x5c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_su(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_su: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x3c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_ss(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x7c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + + + +define i32 @test_llvm_amdgcn_sudot8_uu_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x1c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_us_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_us_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x5c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_su_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_su_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x3c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_ss_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x7c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1) + ret i32 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 + +declare i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a, <2 x i16> %b, i16 %c) + +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( +; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + i16 addrspace(1)* %r, + <2 x i16> addrspace(1)* %a, + <2 x i16> addrspace(1)* %b, + i16 addrspace(1)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b + %c.val = load i16, i16 addrspace(1)* %c + %r.val = call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a.val, <2 x i16> %b.val, i16 %c.val) + store i16 %r.val, i16 addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 + +declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c) + +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( +; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dot2_f16_f16 v1, s2, s3, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + half addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b, + half addrspace(1)* %c) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %c.val = load half, half addrspace(1)* %c + %r.val = call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a.val, <2 x half> %b.val, half %c.val) + store half %r.val, half addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 + +declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a, <2 x i16> %b, float %c, i1 %clamp) + +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( +; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm + float addrspace(1)* %r, + <2 x i16> addrspace(1)* %a, + <2 x i16> addrspace(1)* %b, + float addrspace(1)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b + %c.val = load float, float addrspace(1)* %c + %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 1) + store float %r.val, float addrspace(1)* %r + ret void +} + + +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp( +; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm + float addrspace(1)* %r, + <2 x i16> addrspace(1)* %a, + <2 x i16> addrspace(1)* %b, + float addrspace(1)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b + %c.val = load float, float addrspace(1)* %c + %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a.val, <2 x i16> %b.val, float %c.val, i1 0) + store float %r.val, float addrspace(1)* %r + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot4.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 + +declare i32 @llvm.amdgcn.sudot4(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp) + +define i32 @test_llvm_amdgcn_sudot4_uu(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x1c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_us(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_us: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x5c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_su(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_su: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x3c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_ss(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x16,0xcc,0x00,0x03,0x0a,0x7c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + + + +define i32 @test_llvm_amdgcn_sudot4_uu_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_uu_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x1c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_us_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_us_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x5c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_su_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_su_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x3c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot4_ss_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot4_ss_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x16,0xcc,0x00,0x03,0x0a,0x7c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot4(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1) + ret i32 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sudot8.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 + +declare i32 @llvm.amdgcn.sudot8(i1 %asign, i32 %a, i1 %bsign, i32 %b, i32 %c, i1 %clamp) + +define i32 @test_llvm_amdgcn_sudot8_uu(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x1c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_us(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_us: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x5c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_su(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_su: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x3c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_ss(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] ; encoding: [0x00,0x40,0x18,0xcc,0x00,0x03,0x0a,0x7c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 0) + ret i32 %ret +} + + + +define i32 @test_llvm_amdgcn_sudot8_uu_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_uu_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x1c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 0, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_us_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_us_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[0,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x5c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 0, i32 %a, i1 1, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_su_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_su_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,0,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x3c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 0, i32 %b, i32 %c, i1 1) + ret i32 %ret +} + +define i32 @test_llvm_amdgcn_sudot8_ss_clamp(i32 %a, i32 %b, i32 %c) { +; GFX11-LABEL: test_llvm_amdgcn_sudot8_ss_clamp: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_dot8_i32_iu4 v0, v0, v1, v2 neg_lo:[1,1,0] clamp ; encoding: [0x00,0xc0,0x18,0xcc,0x00,0x03,0x0a,0x7c] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +entry: + %ret = call i32 @llvm.amdgcn.sudot8(i1 1, i32 %a, i1 1, i32 %b, i32 %c, i1 1) + ret i32 %ret +}