diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1019,6 +1019,59 @@ reduction will be performed using default iterative strategy. Intrinsic is currently only implemented for i32. + llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which + support such instructions. This performs unsigned dot product + with two v2i16 operands, summed with the third i32 operand. The + i1 fourth operand is used to clamp the output. + + llvm.amdgcn.udot4 Provides direct access to v_dot4_u32_u8 across targets which + support such instructions. This performs unsigned dot product + with two i32 operands (holding a vector of 4 8bit values), summed + with the third i32 operand. The i1 fourth operand is used to clamp + the output. + + llvm.amdgcn.udot8 Provides direct access to v_dot8_u32_u4 across targets which + support such instructions. This performs unsigned dot product + with two i32 operands (holding a vector of 8 4bit values), summed + with the third i32 operand. The i1 fourth operand is used to clamp + the output. + + llvm.amdgcn.sdot2 Provides direct access to v_dot2_i32_i16 across targets which + support such instructions. This performs signed dot product + with two v2i16 operands, summed with the third i32 operand. The + i1 fourth operand is used to clamp the output. + + llvm.amdgcn.sdot4 Provides direct access to v_dot4_i32_i8 across targets which + support such instructions. This performs signed dot product + with two i32 operands (holding a vector of 4 8bit values), summed + with the third i32 operand. The i1 fourth operand is used to clamp + the output. + RDNA3 does not offer v_dot4_i32_i8, and rather offers + v_dot4_i32_iu8 which has operands to hold the signedness of the + vector operands. Thus, this intrinsic lowers to the signed version + of this instruction for gfx11 targets. 
+ + llvm.amdgcn.sdot8 Provides direct access to v_dot8_i32_i4 across targets which + support such instructions. This performs signed dot product + with two i32 operands (holding a vector of 8 4bit values), summed + with the third i32 operand. The i1 fourth operand is used to clamp + the output. + RDNA3 does not offer v_dot8_i32_i4, and rather offers + v_dot8_i32_iu4 which has operands to hold the signedness of the + vector operands. Thus, this intrinsic lowers to the signed version + of this instruction for gfx11 targets. + + llvm.amdgcn.sudot4 Provides direct access to v_dot4_i32_iu8 on gfx11 targets. This performs + dot product with two i32 operands (holding a vector of 4 8bit values), summed + with the fifth i32 operand. The i1 sixth operand is used to clamp + the output. The i1s preceding the vector operands decide the signedness. + + llvm.amdgcn.sudot8 Provides direct access to v_dot8_i32_iu4 on gfx11 targets. This performs + dot product with two i32 operands (holding a vector of 8 4bit values), summed + with the fifth i32 operand. The i1 sixth operand is used to clamp + the output. The i1s preceding the vector operands decide the signedness. + + ============================================== ========================================================== ..
TODO:: diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -436,6 +436,20 @@ let SubtargetPredicate = HasDot8Insts in { defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>; defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>; + +def : GCNPat < (int_amdgcn_sdot8 i32:$src0, + i32:$src1, + i32:$src2, (i1 timm:$clamp)), + (V_DOT8_I32_IU4 (i32 9), i32:$src0, + (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp) +>; + +def : GCNPat < (int_amdgcn_sdot4 i32:$src0, + i32:$src1, + i32:$src2, (i1 timm:$clamp)), + (V_DOT4_I32_IU8 (i32 9), i32:$src0, + (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp) +>; } // End SubtargetPredicate = HasDot8Insts def : UDot2Pat; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll @@ -3,12 +3,14 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp) ; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp ; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} ; GFX10: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +; GFX11: v_dot4_i32_iu8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp{{$}} define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp( ptr addrspace(1) %r, ptr addrspace(1) %a, 
@@ -28,6 +30,7 @@ ; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp ; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX10: v_dot4c_i32_i8_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX11: v_dot4_i32_iu8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0]{{$}} define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp( ptr addrspace(1) %r, ptr addrspace(1) %a, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll @@ -4,6 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp) @@ -11,6 +12,7 @@ ; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} ; GFX908: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} ; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +; GFX11: v_dot8_i32_iu4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp{{$}} define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp( ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -31,6 +33,7 @@ ; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX908: v_dot8c_i32_i4_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX11: v_dot8_i32_iu4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0]{{$}} define amdgpu_kernel void
@test_llvm_amdgcn_sdot8_no_clamp( ptr addrspace(1) %r, ptr addrspace(1) %a, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906 ; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940 ; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940 @@ -11,6 +12,34 @@ ; GFX9: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} ; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp( +; GFX906-LABEL: test_llvm_amdgcn_udot2_clamp: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX906-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX906-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v1, s8 +; GFX906-NEXT: v_mov_b32_e32 v2, s9 +; GFX906-NEXT: v_dot2_u32_u16 v1, s10, v1, v2 clamp +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_endpgm +; +; GFX10-LABEL: test_llvm_amdgcn_udot2_clamp: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s9, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s10, s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: 
v_dot2_u32_u16 v0, s9, s10, v0 clamp +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -28,6 +57,34 @@ ; GFX9: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp( +; GFX906-LABEL: test_llvm_amdgcn_udot2_no_clamp: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX906-NEXT: s_load_dword s9, s[6:7], 0x0 +; GFX906-NEXT: s_load_dword s10, s[2:3], 0x0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v1, s8 +; GFX906-NEXT: v_mov_b32_e32 v2, s9 +; GFX906-NEXT: v_dot2_u32_u16 v1, s10, v1, v2 +; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_endpgm +; +; GFX10-LABEL: test_llvm_amdgcn_udot2_no_clamp: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s9, s[2:3], 0x0 +; GFX10-NEXT: s_load_dword s10, s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_dot2_u32_u16 v0, s9, s10, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -46,6 +103,46 @@ ; GFX940: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}}{{$}} ; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}} define amdgpu_kernel void @test_llvm_amdgcn_udot2_op_sel( +; GFX906-LABEL: test_llvm_amdgcn_udot2_op_sel: +; GFX906: ; %bb.0: ; %entry +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX906-NEXT: s_load_dword s2, s[0:1], 0x34 
+; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX906-NEXT: v_mov_b32_e32 v1, 0 +; GFX906-NEXT: s_waitcnt lgkmcnt(0) +; GFX906-NEXT: global_load_dword v0, v0, s[6:7] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v0, s2 op_sel:[1,1,0] op_sel_hi:[0,0,1] +; GFX906-NEXT: global_store_dword v1, v0, s[4:5] +; GFX906-NEXT: s_endpgm +; +; GFX940-LABEL: test_llvm_amdgcn_udot2_op_sel: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: global_load_dword v0, v0, s[6:7] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX940-NEXT: v_dot2_u32_u16 v0, v0, v0, s2 +; GFX940-NEXT: s_nop 2 +; GFX940-NEXT: global_store_dword v1, v0, s[4:5] sc0 sc1 +; GFX940-NEXT: s_endpgm +; +; GFX10-LABEL: test_llvm_amdgcn_udot2_op_sel: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v0, s0 op_sel:[1,1,0] op_sel_hi:[0,0,1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, i32 %c) { @@ -57,7 +154,10 @@ %b.elt1 = extractelement <2 x i16> %b.val, i32 1 %b0 = insertelement <2 x i16> undef, i16 %b.elt1, i32 0 %b1 = insertelement <2 x i16> %b0, i16 %b.elt0, i32 1 - %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> , <2 x i16> %b1, i32 %c, i1 0) + %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %b1, <2 x i16> %b1, i32 %c, i1 0) store i32 %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} +; GFX9: {{.*}}