Diff 553560

llvm/docs/AMDGPUUsage.rst

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,013 Lines • ▼ Show 20 Lines	llvm.amdgcn.wave.reduce.umax Performs an arithmetic unsigned max reduction on the unsigned values
Intrinsic takes a hint for reduction strategy using second operand		Intrinsic takes a hint for reduction strategy using second operand
0: Target default preference,		0: Target default preference,
1: `Iterative strategy`, and		1: `Iterative strategy`, and
2: `DPP`.		2: `DPP`.
If target does not support the DPP operations (e.g. gfx6/7),		If target does not support the DPP operations (e.g. gfx6/7),
reduction will be performed using default iterative strategy.		reduction will be performed using default iterative strategy.
Intrinsic is currently only implemented for i32.		Intrinsic is currently only implemented for i32.

		llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
		support such instructions. This performs unsigned dot product
		with two v2i16 operands, summed with the third i32 operand. The
		i1 fourth operand is used to clamp the output.

		llvm.amdgcn.udot4 Provides direct access to v_dot4_u32_u8 across targets which
		support such instructions. This performs unsigned dot product
		with two i32 operands (holding a vector of 4 8bit values), summed
		with the third i32 operand. The i1 fourth operand is used to clamp
		the output.

		llvm.amdgcn.udot8 Provides direct access to v_dot8_u32_u4 across targets which
		support such instructions. This performs unsigned dot product
		with two i32 operands (holding a vector of 8 4bit values), summed
		with the third i32 operand. The i1 fourth operand is used to clamp
		the output.

		llvm.amdgcn.sdot2 Provides direct access to v_dot2_i32_i16 across targets which
		support such instructions. This performs signed dot product
		with two v2i16 operands, summed with the third i32 operand. The
		i1 fourth operand is used to clamp the output.
		When applicable (e.g. no clamping), this is lowered into
		v_dot2c_i32_i16 for targets which support it.

		llvm.amdgcn.sdot4 Provides direct access to v_dot4_i32_i8 across targets which
		support such instructions. This performs signed dot product
		with two i32 operands (holding a vector of 4 8bit values), summed
		with the third i32 operand. The i1 fourth operand is used to clamp
		the output.
		When applicable (i.e. no clamping / operand modifiers), this is lowered
		into v_dot4c_i32_i8 for targets which support it.
		RDNA3 does not offer v_dot4_i32_i8, and rather offers
		v_dot4_i32_iu8 which has operands to hold the signedness of the
		vector operands. Thus, this intrinsic lowers to the signed version
		of this instruction for gfx11 targets.

		llvm.amdgcn.sdot8 Provides direct access to v_dot8_i32_i4 across targets which
		support such instructions. This performs signed dot product
		with two i32 operands (holding a vector of 8 4bit values), summed
		with the third i32 operand. The i1 fourth operand is used to clamp
		the output.
		arsenmUnsubmitted Not Done Reply Inline Actions Think this needs a new line separator arsenm: Think this needs a new line separator
		When applicable (i.e. no clamping / operand modifiers), this is lowered
		into v_dot8c_i32_i4 for targets which support it.
		RDNA3 does not offer v_dot8_i32_i4, and rather offers
		v_dot8_i32_iu4 which has operands to hold the signedness of the
		vector operands. Thus, this intrinsic lowers to the signed version
		of this instruction for gfx11 targets.

		llvm.amdgcn.sudot4 Provides direct access to v_dot4_i32_iu8 on gfx11 targets. This performs
		dot product with two i32 operands (holding a vector of 4 8bit values), summed
		with the fifth i32 operand. The i1 sixth operand is used to clamp
		the output. The i1s preceding the vector operands decide the signedness.

		llvm.amdgcn.sudot8 Provides direct access to v_dot8_i32_iu4 on gfx11 targets. This performs
		dot product with two i32 operands (holding a vector of 8 4bit values), summed
		with the fifth i32 operand. The i1 sixth operand is used to clamp
		the output. The i1s preceding the vector operands decide the signedness.


============================================== ==========================================================		============================================== ==========================================================

.. TODO::		.. TODO::

List AMDGPU intrinsics.		List AMDGPU intrinsics.

LLVM IR Attributes		LLVM IR Attributes
------------------		------------------
▲ Show 20 Lines • Show All 14,422 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Show First 20 Lines • Show All 428 Lines • ▼ Show 20 Lines	def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0,
i32:$src2, (i1 timm:$clamp)),		i32:$src2, (i1 timm:$clamp)),
(!cast<Instruction>(NAME) $src0_mods, i32:$src0,		(!cast<Instruction>(NAME) $src0_mods, i32:$src0,
$src1_mods, i32:$src1,		$src1_mods, i32:$src1,
(i32 8), i32:$src2, i1:$clamp)		(i32 8), i32:$src2, i1:$clamp)
>;		>;
}		}

let SubtargetPredicate = HasDot8Insts in {		let SubtargetPredicate = HasDot8Insts in {
defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;		defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;
defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;		defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;

		def : GCNPat < (int_amdgcn_sdot8 i32:$src0,
		i32:$src1,
		i32:$src2, (i1 timm:$clamp)),
		(V_DOT8_I32_IU4 (i32 9), i32:$src0,
		(i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp)
		>;

		def : GCNPat < (int_amdgcn_sdot4 i32:$src0,
		i32:$src1,
		i32:$src2, (i1 timm:$clamp)),
		(V_DOT4_I32_IU8 (i32 9), i32:$src0,
		(i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp)
		>;
} // End SubtargetPredicate = HasDot8Insts		} // End SubtargetPredicate = HasDot8Insts
		arsenmUnsubmitted Not Done Reply Inline Actions I don't understand how these cases are different, the intrinsic name is just slightly different from the instruction name? arsenm: I don't understand how these cases are different, the intrinsic name is just slightly different…
		jrbyrnesAuthorUnsubmitted Done Reply Inline Actions On all other targets with 8bit and 4bit signed dot, we codegen for int_amdgcn_sdot4 and int_amdgcn_sdot8. However, we don't support these on gfx1100 -- instead, gfx1100 has int_amdgcn_sUdot4 / int_amdgcn_sUdot8. The result is that users of these intrinsics must always check the target to use the corresponding one (sudot4 for gfx1100, and sdot4 for all others). This removes that responsibility from the user, so they are able to use sdot4 across all targets and generate the corresponding instructions. jrbyrnes: On all other targets with 8bit and 4bit signed dot, we codegen for int_amdgcn_sdot4 and…
		arsenmUnsubmitted Not Done Reply Inline Actions Are there unit tests for these somewhere? I don't really know the full history of these instructions and I'm worried there was some random edge case behavior change arsenm: Are there unit tests for these somewhere? I don't really know the full history of these…
		jrbyrnesAuthorUnsubmitted Done Reply Inline Actions Apologies, It is my mistake potentially causing confusion. The main difference between V_DOT4_I32_IU8 on gfx1100 and V_DOT4_I32_I8 on gfx90a (for example), is that V_DOT4_I32_IU8 can be either signed or unsigned depending on NEG bit in operand modifier. This target specific feature is probably why there is special handling. See llvm.amdgcn.sudot4 for unit tests. jrbyrnes: Apologies, It is my mistake potentially causing confusion. The main difference between…
		arsenmUnsubmitted Not Done Reply Inline Actions I mean tests that actually execute, not lit tests arsenm: I mean tests that actually execute, not lit tests
		jrbyrnesAuthorUnsubmitted Done Reply Inline Actions So I've tracked down some unit tests. https://github.com/ROCm-Developer-Tools/HIP/blob/b8965f1f3d58d7adf7d702c09e75ebf3dd718f8c/tests/src/deviceLib/hipTestDotFunctions.cpp#L34 These calls are implemented as calls to __ockl_sdot4: https://github.com/ROCm-Developer-Tools/clr/blob/5914ac3c6e9b3848023a7fa25e19e560b1c38541/hipamd/include/hip/amd_detail/amd_math_functions.h#L148C60-L148C60 Which is, in turn, implemented as calls to target specific builtins: https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/46939af92ad91238c878a82aad2220822073ffa1/ockl/src/dots.cl#L124 For gfx1100, this lowers to __builtin_amdgcn_sudot4 builtin. If you want, I can hack a compiler to lower the __builtin_amdgcn_sudot4 into int_amdgcn_sdot4 and find a way to run these tests. jrbyrnes: So I've tracked down some unit tests. https://github.com/ROCm-Developer…
		jrbyrnesAuthorUnsubmitted Done Reply Inline Actions Probably worth mentioning is that I have been validating correctness using CK 8 bit and 16 bit test suite, which -- due to https://reviews.llvm.org/D155995 -- has many existing tests that lower into int_amdgcn_sdot4 for gfx1100. jrbyrnes: Probably worth mentioning is that I have been validating correctness using CK 8 bit and 16 bit…
		arsenmUnsubmitted Not Done Reply Inline Actions So apparently we have overlapping intrinsics. We should probably canonicalize llvm.amdgcn.sudot4 cases representable with sdot/udot in AMDGPUInstCombineIntrinsic arsenm: So apparently we have overlapping intrinsics. We should probably canonicalize llvm.amdgcn.
		arsenmUnsubmitted Not Done Reply Inline Actions Ugh, this test is bad. It barely tests it compiles. Really these should test all the edge cases arsenm: Ugh, this test is bad. It barely tests it compiles. Really these should test all the edge cases
		arsenmUnsubmitted Not Done Reply Inline Actions Not even that, this is barely a front end test. the optimizer can delete most all of this arsenm: Not even that, this is barely a front end test. the optimizer can delete most all of this

def : UDot2Pat<V_DOT2_U32_U16>;		def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;		def : SDot2Pat<V_DOT2_I32_I16>;

foreach Type = ["U", "I"] in		foreach Type = ["U", "I"] in
let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in		let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in
def : GCNPat <		def : GCNPat <
!cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,		!cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
▲ Show 20 Lines • Show All 827 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll

	; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX906			; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX906
	; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10			; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10
	; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10			; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10
	; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10			; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10
	; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10			; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10
				; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GFX11

	declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp)			declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp)

	; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp			; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp
	; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}			; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
	; GFX10: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}			; GFX10: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
				; GFX11: v_dot4_i32_iu8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp{{$}}
	define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp(			define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp(
	ptr addrspace(1) %r,			ptr addrspace(1) %r,
	ptr addrspace(1) %a,			ptr addrspace(1) %a,
	ptr addrspace(1) %b,			ptr addrspace(1) %b,
	ptr addrspace(1) %c) {			ptr addrspace(1) %c) {
	entry:			entry:
	%a.val = load <4 x i8>, ptr addrspace(1) %a			%a.val = load <4 x i8>, ptr addrspace(1) %a
	%b.val = load <4 x i8>, ptr addrspace(1) %b			%b.val = load <4 x i8>, ptr addrspace(1) %b
	%a.val.cast = bitcast <4 x i8> %a.val to i32			%a.val.cast = bitcast <4 x i8> %a.val to i32
	%b.val.cast = bitcast <4 x i8> %b.val to i32			%b.val.cast = bitcast <4 x i8> %b.val to i32
	%c.val = load i32, ptr addrspace(1) %c			%c.val = load i32, ptr addrspace(1) %c
	%r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)			%r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
	store i32 %r.val, ptr addrspace(1) %r			store i32 %r.val, ptr addrspace(1) %r
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp			; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp
	; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}			; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
	; GFX10: v_dot4c_i32_i8_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}			; GFX10: v_dot4c_i32_i8_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
				; GFX11: v_dot4_i32_iu8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0]{{$}}
	define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp(			define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp(
	ptr addrspace(1) %r,			ptr addrspace(1) %r,
	ptr addrspace(1) %a,			ptr addrspace(1) %a,
	ptr addrspace(1) %b,			ptr addrspace(1) %b,
	ptr addrspace(1) %c) {			ptr addrspace(1) %c) {
	entry:			entry:
	%a.val = load <4 x i8>, ptr addrspace(1) %a			%a.val = load <4 x i8>, ptr addrspace(1) %a
	%b.val = load <4 x i8>, ptr addrspace(1) %b			%b.val = load <4 x i8>, ptr addrspace(1) %b
	%a.val.cast = bitcast <4 x i8> %a.val to i32			%a.val.cast = bitcast <4 x i8> %a.val to i32
	%b.val.cast = bitcast <4 x i8> %b.val to i32			%b.val.cast = bitcast <4 x i8> %b.val to i32
	%c.val = load i32, ptr addrspace(1) %c			%c.val = load i32, ptr addrspace(1) %c
	%r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)			%r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
	store i32 %r.val, ptr addrspace(1) %r			store i32 %r.val, ptr addrspace(1) %r
	ret void			ret void
	}			}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll

	; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX906			; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX906
	; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX908			; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX908
	; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10			; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10
	; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10			; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10
	; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10			; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10
	; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10			; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GCN,GFX10
				; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s \| FileCheck %s --check-prefixes=GFX11

	declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp)			declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp)

	; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp			; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp
	; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}			; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
	; GFX908: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}			; GFX908: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
	; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}			; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
				; GFX11: v_dot8_i32_iu4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp{{$}}
	define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp(			define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp(
	ptr addrspace(1) %r,			ptr addrspace(1) %r,
	ptr addrspace(1) %a,			ptr addrspace(1) %a,
	ptr addrspace(1) %b,			ptr addrspace(1) %b,
	ptr addrspace(1) %c) {			ptr addrspace(1) %c) {
	entry:			entry:
	%a.val = load <8 x i4>, ptr addrspace(1) %a			%a.val = load <8 x i4>, ptr addrspace(1) %a
	%b.val = load <8 x i4>, ptr addrspace(1) %b			%b.val = load <8 x i4>, ptr addrspace(1) %b
	%a.val.cast = bitcast <8 x i4> %a.val to i32			%a.val.cast = bitcast <8 x i4> %a.val to i32
	%b.val.cast = bitcast <8 x i4> %b.val to i32			%b.val.cast = bitcast <8 x i4> %b.val to i32
	%c.val = load i32, ptr addrspace(1) %c			%c.val = load i32, ptr addrspace(1) %c
	%r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)			%r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
	store i32 %r.val, ptr addrspace(1) %r			store i32 %r.val, ptr addrspace(1) %r
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp			; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp
	; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}			; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
	; GFX908: v_dot8c_i32_i4_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}			; GFX908: v_dot8c_i32_i4_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
	; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}			; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
				; GFX11: v_dot8_i32_iu4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0]{{$}}
	define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp(			define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp(
	ptr addrspace(1) %r,			ptr addrspace(1) %r,
	ptr addrspace(1) %a,			ptr addrspace(1) %a,
	ptr addrspace(1) %b,			ptr addrspace(1) %b,
	ptr addrspace(1) %c) {			ptr addrspace(1) %c) {
	entry:			entry:
	%a.val = load <8 x i4>, ptr addrspace(1) %a			%a.val = load <8 x i4>, ptr addrspace(1) %a
	%b.val = load <8 x i4>, ptr addrspace(1) %b			%b.val = load <8 x i4>, ptr addrspace(1) %b
	%a.val.cast = bitcast <8 x i4> %a.val to i32			%a.val.cast = bitcast <8 x i4> %a.val to i32
	%b.val.cast = bitcast <8 x i4> %b.val to i32			%b.val.cast = bitcast <8 x i4> %b.val to i32
	%c.val = load i32, ptr addrspace(1) %c			%c.val = load i32, ptr addrspace(1) %c
	%r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)			%r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0)
	store i32 %r.val, ptr addrspace(1) %r			store i32 %r.val, ptr addrspace(1) %r
	ret void			ret void
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Support sdot4 / sdot8 intrinsics on gfx11
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 553560

llvm/docs/AMDGPUUsage.rst

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Support sdot4 / sdot8 intrinsics on gfx11
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 553560

llvm/docs/AMDGPUUsage.rst

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll

[AMDGPU] Support sdot4 / sdot8 intrinsics on gfx11
ClosedPublic