Index: lib/Target/AMDGPU/VOP3PInstructions.td =================================================================== --- lib/Target/AMDGPU/VOP3PInstructions.td +++ lib/Target/AMDGPU/VOP3PInstructions.td @@ -168,34 +168,53 @@ class Srl : PatFrag<(ops node:$src), (srl node:$src, (i32 N))>; -foreach Bits = [8, 16, 24] in { - def srl#Bits : Srl; -} - -def and_255 : PatFrag< - (ops node:$src0), (and node:$src0, (i32 255)) ->; +foreach Bits = 1-7 in + def srl#!shl(Bits, 2) : Srl; -class Extract_U8 : PatFrag<( - ops node:$src), - !if (!eq (FromBitIndex, 24), // last element +class Extract_U : PatFrag< + (ops node:$src), + !if (!or (!and (!eq (BitMask, 255), !eq (FromBitIndex, 24)), + !and (!eq (BitMask, 15), !eq (FromBitIndex, 28))), // last element (!cast("srl"#FromBitIndex) node:$src), !if (!eq (FromBitIndex, 0), // first element - (and_255 node:$src), - (and_255 (!cast("srl"#FromBitIndex) node:$src))))>; + (and node:$src, (i32 BitMask)), + (and (!cast("srl"#FromBitIndex) node:$src), (i32 BitMask))))>; -// Defines patterns that extract each Index'ed 8bit from a 32bit scalar value; -foreach Index = [1, 2, 3, 4] in { - def UElt#Index : Extract_U8; -} +foreach Index = 1-4 in { + // Defines patterns that extract each Index'ed 8bit from an unsigned + // 32bit scalar value; + def U#Index#"_8bit" : Extract_U; -// Defines multiplication patterns where the multiplication is happening on each -// Index'ed 8bit of a 32bit scalar value. -foreach Index = [1, 2, 3, 4] in { + // Defines multiplication patterns where the multiplication is happening on each + // Index'ed 8bit of a 32bit scalar value. def MulU_Elt#Index : PatFrag< (ops node:$src0, node:$src1), - (AMDGPUmul_u24_oneuse (!cast("UElt"#Index) node:$src0), - (!cast("UElt"#Index) node:$src1))>; + (AMDGPUmul_u24_oneuse (!cast("U"#Index#"_8bit") node:$src0), + (!cast("U"#Index#"_8bit") node:$src1))>; +} + +// Different variants of dot8 patterns cause a huge increase in the compile time. 
+// Define non-associative/commutative add/mul to prevent permutation in the dot8 +// pattern. +def NonACAdd : SDNode<"ISD::ADD" , SDTIntBinOp>; +def NonACAdd_oneuse : HasOneUseBinOp; + +def NonACAMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24" , SDTIntBinOp>; +def NonACAMDGPUmul_u24_oneuse : HasOneUseBinOp; + +foreach Index = 1-8 in { + // Defines patterns that extract each Index'ed 4bit from an unsigned + // 32bit scalar value; + def U#Index#"_4bit" : Extract_U; + + // Defines multiplication patterns where the multiplication is happening on each + // Index'ed 4bit of a 32bit scalar value. + def MulU#Index#"_4bit" : PatFrag< + (ops node:$src0, node:$src1), + (NonACAMDGPUmul_u24_oneuse (!cast("U"#Index#"_4bit") node:$src0), + (!cast("U"#Index#"_4bit") node:$src1))>; } class UDot2Pat : GCNPat < @@ -251,6 +270,12 @@ (V_DOT4_U32_U8 (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0)) >; +def : GCNPat < + !cast(!foldl((add_oneuse i32:$src2, (MulU1_4bit i32:$src0, i32:$src1)), [2, 3, 4, 5, 6, 7, 8], lhs, y, + (NonACAdd_oneuse lhs, (!cast("MulU"#y#"_4bit") i32:$src0, i32:$src1)))), + (V_DOT8_U32_U4 (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0)) +>; + } // End SubtargetPredicate = HasDLInsts multiclass VOP3P_Real_vi op> { Index: test/CodeGen/AMDGPU/idot8.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/idot8.ll @@ -0,0 +1,812 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX789 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX789 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX789 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DL %s + +define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, +; GCN-LABEL: udot8_acc32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: 
s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} + +; GFX789: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GFX789-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GFX789-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GFX789: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 28 +; GFX789-NEXT: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 28 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40018 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40014 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x4000c +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40008 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40004 +; GFX789-NEXT: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40018 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40014 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x4000c +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40008 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40004 +; GFX789-NEXT: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +; GFX789-NEXT: v_mov_b32_e32 v{{[0-9]+}} +; GFX789-NEXT: v_mov_b32_e32 [[SRC2:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[SRC2]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E2:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], s{{[0-9]+}}, [[V2E2]], [[MAD1]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E3:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], s{{[0-9]+}}, [[V2E3]], [[MAD2]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], s{{[0-9]+}}, [[V2E4]], [[MAD3]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E5:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD5:v[0-9]+]], s{{[0-9]+}}, [[V2E5]], [[MAD4]] 
+; GFX789-NEXT: v_mov_b32_e32 [[V2E6:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD6:v[0-9]+]], s{{[0-9]+}}, [[V2E6]], [[MAD5]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E7:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD7:v[0-9]+]], s{{[0-9]+}}, [[V2E7]], [[MAD6]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E8:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD8:v[0-9]+]], s{{[0-9]+}}, [[V2E8]], [[MAD7]] +; GFX789-NEXT: {{buffer|flat|global}}_store_dword +; GFX789-NEXT: s_endpgm + +; GCN-DL: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s5 +; GCN-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3 +; GCN-DL-NEXT: global_store_dword v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + + <8 x i4> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %cv1e0 = zext i4 %v1e0 to i32 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %cv2e0 = zext i4 %v2e0 to i32 + %mul0 = mul nuw nsw i32 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %cv1e1 = zext i4 %v1e1 to i32 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %cv2e1 = zext i4 %v2e1 to i32 + %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %cv1e2 = zext i4 %v1e2 to i32 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %cv2e2 = zext i4 %v2e2 to i32 + %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %cv1e3 = zext i4 %v1e3 to i32 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %cv2e3 = zext i4 %v2e3 to i32 + %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %cv1e4 = zext i4 %v1e4 
to i32 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %cv2e4 = zext i4 %v2e4 to i32 + %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %cv1e5 = zext i4 %v1e5 to i32 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %cv2e5 = zext i4 %v2e5 to i32 + %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %cv1e6 = zext i4 %v1e6 to i32 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %cv2e6 = zext i4 %v2e6 to i32 + %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %cv1e7 = zext i4 %v1e7 to i32 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %cv2e7 = zext i4 %v2e7 to i32 + %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7 + + %acc = load i32, i32 addrspace(1)* %dst, align 4 + %add1 = add i32 %mul0, %acc + %add2 = add i32 %add1, %mul1 + %add3 = add i32 %add2, %mul2 + %add4 = add i32 %add3, %mul3 + %add5 = add i32 %add4, %mul4 + %add6 = add i32 %add5, %mul5 + %add7 = add i32 %add6, %mul6 + %add8 = add i32 %add7, %mul7 + + store i32 %add8, i32 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Remove the unnecessary instruction (the one zero-extending the +; 2nd MAD) so that the pattern recognizer can kick in. 
+define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, +; GCN-LABEL: udot8_acc16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GFX789: s_load_dword +; GFX789: {{buffer|flat|global}}_load_ushort +; GFX789: s_waitcnt lgkmcnt(0) +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40004 +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40014 +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +; GFX789: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX789-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD1]] +; GFX789: v_mad_u32_u24 [[MAD3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD2]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD3]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD5:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD4]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD6:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD5]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD7:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD6]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E8:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD8:v[0-9]+]], s{{[0-9]+}}, [[V2E8]], [[MAD7]] +; GFX789-NEXT: {{buffer|flat|global}}_store_short +; GFX789-NEXT: s_endpgm + +; GCN-DL: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: global_load_ushort v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s5 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 +; 
GCN-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s14, s4, 28 +; GCN-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s5 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GCN-DL-NEXT: v_mov_b32_e32 v6, s4 +; GCN-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s8 +; GCN-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v8, s10 +; GCN-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v9, s12 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s14 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GCN-DL-NEXT: global_store_short v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i16 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %cv1e0 = zext i4 %v1e0 to i16 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %cv2e0 = zext i4 %v2e0 to i16 + %mul0 = mul nuw nsw i16 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %cv1e1 = zext i4 %v1e1 to i16 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %cv2e1 = zext i4 %v2e1 to i16 + %mul1 = mul nuw nsw i16 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %cv1e2 = zext i4 %v1e2 to i16 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %cv2e2 = zext i4 %v2e2 to i16 + %mul2 = mul nuw nsw i16 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %cv1e3 = zext i4 %v1e3 to i16 + %v2e3 = 
extractelement <8 x i4> %vec2, i64 3 + %cv2e3 = zext i4 %v2e3 to i16 + %mul3 = mul nuw nsw i16 %cv1e3, %cv2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %cv1e4 = zext i4 %v1e4 to i16 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %cv2e4 = zext i4 %v2e4 to i16 + %mul4 = mul nuw nsw i16 %cv1e4, %cv2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %cv1e5 = zext i4 %v1e5 to i16 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %cv2e5 = zext i4 %v2e5 to i16 + %mul5 = mul nuw nsw i16 %cv1e5, %cv2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %cv1e6 = zext i4 %v1e6 to i16 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %cv2e6 = zext i4 %v2e6 to i16 + %mul6 = mul nuw nsw i16 %cv1e6, %cv2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %cv1e7 = zext i4 %v1e7 to i16 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %cv2e7 = zext i4 %v2e7 to i16 + %mul7 = mul nuw nsw i16 %cv1e7, %cv2e7 + + %acc = load i16, i16 addrspace(1)* %dst, align 4 + %add1 = add i16 %mul0, %acc + %add2 = add i16 %add1, %mul1 + %add3 = add i16 %add2, %mul2 + %add4 = add i16 %add3, %mul3 + %add5 = add i16 %add4, %mul4 + %add6 = add i16 %add5, %mul5 + %add7 = add i16 %add6, %mul6 + %add8 = add i16 %add7, %mul7 + + store i16 %add8, i16 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Remove the unnecessary instruction (the one zero-extending the +; 2nd MAD) so that the pattern recognizer can kick in. 
+define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, +; GCN-LABEL: udot8_acc8: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GFX789: s_waitcnt lgkmcnt(0) +; GFX789: s_load_dword +; GFX789: s_load_dword +; GFX789: s_waitcnt lgkmcnt(0) +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40014 +; GFX789: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 28 +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x4000c +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40008 +; GFX789: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX789-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD1]] +; GFX789: v_mad_u32_u24 [[MAD3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD2]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD3]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD5:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD4]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD6:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD5]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD7:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD6]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E8:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD8:v[0-9]+]], s{{[0-9]+}}, [[V2E8]], [[MAD7]] +; GFX789-NEXT: {{buffer|flat|global}}_store_byte +; GFX789-NEXT: s_endpgm + +; GCN-DL: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s5 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s8, 
s4, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s14, s4, 28 +; GCN-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s5 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GCN-DL-NEXT: v_mov_b32_e32 v6, s4 +; GCN-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s8 +; GCN-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v8, s10 +; GCN-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v9, s12 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s14 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GCN-DL-NEXT: global_store_byte v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + + <8 x i4> addrspace(1)* %src2, + i8 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %cv1e0 = zext i4 %v1e0 to i8 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %cv2e0 = zext i4 %v2e0 to i8 + %mul0 = mul nuw nsw i8 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %cv1e1 = zext i4 %v1e1 to i8 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %cv2e1 = zext i4 %v2e1 to i8 + %mul1 = mul nuw nsw i8 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %cv1e2 = zext i4 %v1e2 to i8 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %cv2e2 = zext i4 %v2e2 to i8 + %mul2 = mul nuw nsw i8 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 
+ %cv1e3 = zext i4 %v1e3 to i8 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %cv2e3 = zext i4 %v2e3 to i8 + %mul3 = mul nuw nsw i8 %cv1e3, %cv2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %cv1e4 = zext i4 %v1e4 to i8 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %cv2e4 = zext i4 %v2e4 to i8 + %mul4 = mul nuw nsw i8 %cv1e4, %cv2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %cv1e5 = zext i4 %v1e5 to i8 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %cv2e5 = zext i4 %v2e5 to i8 + %mul5 = mul nuw nsw i8 %cv1e5, %cv2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %cv1e6 = zext i4 %v1e6 to i8 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %cv2e6 = zext i4 %v2e6 to i8 + %mul6 = mul nuw nsw i8 %cv1e6, %cv2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %cv1e7 = zext i4 %v1e7 to i8 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %cv2e7 = zext i4 %v2e7 to i8 + %mul7 = mul nuw nsw i8 %cv1e7, %cv2e7 + + %acc = load i8, i8 addrspace(1)* %dst, align 4 + %add1 = add i8 %mul0, %acc + %add2 = add i8 %add1, %mul1 + %add3 = add i8 %add2, %mul2 + %add4 = add i8 %add3, %mul3 + %add5 = add i8 %add4, %mul4 + %add6 = add i8 %add5, %mul5 + %add7 = add i8 %add6, %mul6 + %add8 = add i8 %add7, %mul7 + + store i8 %add8, i8 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Remove the two unnecessary instructions (and + add after the 2nd MAD) +; so that the pattern recognizer can kick in. 
+define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, +; GCN-LABEL: udot8_acc4: +; GCN: ; %bb.0: ; %entry +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} + +; GFX789: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: s_load_dword +; GFX789: s_load_dword +; GFX789: s_waitcnt lgkmcnt(0) +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x4000c +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40014 +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +; GFX789: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX789-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD1]] +; GFX789: v_mad_u32_u24 [[MAD3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD2]] +; GFX789: v_mad_u32_u24 [[MAD4:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD3]] +; GFX789: v_mad_u32_u24 [[MAD5:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD4]] +; GFX789: v_mad_u32_u24 [[MAD6:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD5]] +; GFX789: v_mad_u32_u24 [[MAD7:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD6]] +; GFX789: {{buffer|flat|global}}_store_byte +; GFX789-NEXT: s_endpgm + +; GCN-DL: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s6 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v5, s5 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GCN-DL-NEXT: v_mul_u32_u24_e32 v4, s7, v4 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x4000c +; GCN-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GCN-DL-NEXT: s_bfe_u32 s7, s4, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v6, s5 +; GCN-DL-NEXT: 
s_bfe_u32 s6, s2, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s7 +; GCN-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s9, s4, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v8, s8 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GCN-DL-NEXT: v_mov_b32_e32 v9, s9 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, v5, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v8, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s8, v9, v2 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GCN-DL-NEXT: global_store_byte v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i4 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %mul0 = mul nuw nsw i4 %v1e0, %v2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %mul1 = mul nuw nsw i4 %v1e1, %v2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %mul2 = mul nuw nsw i4 %v1e2, %v2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %mul3 = mul nuw nsw i4 %v1e3, %v2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %mul4 = mul nuw nsw i4 %v1e4, %v2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %mul5 = mul nuw nsw i4 %v1e5, %v2e5 + 
+ %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %mul6 = mul nuw nsw i4 %v1e6, %v2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %mul7 = mul nuw nsw i4 %v1e7, %v2e7 + + %acc = load i4, i4 addrspace(1)* %dst, align 4 + %add1 = add i4 %mul0, %acc + %add2 = add i4 %add1, %mul1 + %add3 = add i4 %add2, %mul2 + %add4 = add i4 %add3, %mul3 + %add5 = add i4 %add4, %mul4 + %add6 = add i4 %add5, %mul5 + %add7 = add i4 %add6, %mul6 + %add8 = add i4 %add7, %mul7 + + store i4 %add8, i4 addrspace(1)* %dst, align 4 + ret void +} + +; TODO: Currently, permutation of udot8 is turned off due to a huge increase +; in the compile time. +define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1, +; GCN-LABEL: udot8_CommutationInsideMAD: +; GCN: ; %bb.0: ; %entry +; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} + +; GFX789: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: s_load_dword +; GFX789: s_load_dword +; GFX789: s_waitcnt lgkmcnt(0) +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x4000c +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40014 +; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +; GFX789: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX789-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD1]] +; GFX789: v_mad_u32_u24 [[MAD3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD2]] +; GFX789: v_mad_u32_u24 [[MAD4:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD3]] +; GFX789: v_mad_u32_u24 [[MAD5:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD4]] +; GFX789: v_mad_u32_u24 [[MAD6:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD5]] +; GFX789: v_mad_u32_u24 [[MAD7:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[MAD6]] +; GFX789: {{buffer|flat|global}}_store_byte +; 
GFX789-NEXT: s_endpgm + +; GCN-DL: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_and_b32 s0, s2, 15 +; GCN-DL-NEXT: s_and_b32 s1, s4, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s1 +; GCN-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s5 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 +; GCN-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c +; GCN-DL-NEXT: v_mov_b32_e32 v5, s6 +; GCN-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 +; GCN-DL-NEXT: v_mov_b32_e32 v6, s7 +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GCN-DL-NEXT: v_mov_b32_e32 v7, s8 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 +; GCN-DL-NEXT: v_mov_b32_e32 v8, s9 +; GCN-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GCN-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GCN-DL-NEXT: v_mov_b32_e32 v9, s10 +; GCN-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GCN-DL-NEXT: s_waitcnt vmcnt(0) +; GCN-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s5, v5, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s8, v8, v2 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s9, v9, v2 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s4 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GCN-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GCN-DL-NEXT: global_store_byte v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + + <8 x i4> addrspace(1)* %src2, + i4 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, 
i64 0 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %mul0 = mul nuw nsw i4 %v1e0, %v2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %mul1 = mul nuw nsw i4 %v1e1, %v2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %mul2 = mul nuw nsw i4 %v1e2, %v2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %mul3 = mul nuw nsw i4 %v1e3, %v2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %mul4 = mul nuw nsw i4 %v1e4, %v2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %mul5 = mul nuw nsw i4 %v1e5, %v2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %mul6 = mul nuw nsw i4 %v1e6, %v2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %mul7 = mul nuw nsw i4 %v1e7, %v2e7 + + %acc = load i4, i4 addrspace(1)* %dst, align 4 + %add1 = add i4 %mul0, %acc + %add2 = add i4 %mul1, %add1 + %add3 = add i4 %mul2, %add2 + %add4 = add i4 %mul3, %add3 + %add5 = add i4 %mul4, %add4 + %add6 = add i4 %mul5, %add5 + %add7 = add i4 %mul6, %add6 + %add8 = add i4 %mul7, %add7 + + store i4 %add8, i4 addrspace(1)* %dst, align 4 + ret void +} + +define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, +; GCN-LABEL: udot8_multiuses_mul1: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} + +; GFX789: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GFX789-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GFX789-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0 +; GFX789: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 28 +; GFX789: s_lshr_b32 
s{{[0-9]+}}, s{{[0-9]+}}, 28 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40018 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40014 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x4000c +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40008 +; GFX789: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40018 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40014 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x4000c +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40008 +; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40004 +; GFX789-NEXT: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +; GFX789-NEXT: v_mov_b32_e32 v{{[0-9]+}} +; GFX789-NEXT: v_mov_b32_e32 [[SRC2:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[SRC2]] +; GFX789: v_mov_b32_e32 [[V2E2:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 +; GFX789: v_mov_b32_e32 [[V2E3:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}} +; GFX789-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], s{{[0-9]+}}, [[V2E4]], [[MAD3]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E5:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD5:v[0-9]+]], s{{[0-9]+}}, [[V2E5]], [[MAD4]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E6:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD6:v[0-9]+]], s{{[0-9]+}}, [[V2E6]], [[MAD5]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E7:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD7:v[0-9]+]], s{{[0-9]+}}, [[V2E7]], [[MAD6]] +; GFX789-NEXT: v_mov_b32_e32 [[V2E8:v[0-9]+]] +; GFX789-NEXT: v_mad_u32_u24 [[MAD8:v[0-9]+]] +; GFX789: {{buffer|flat|global}}_store_dword +; GFX789-NEXT: s_endpgm + +; GCN-DL: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; 
GCN-DL-NEXT: v_mov_b32_e32 v0, s0 +; GCN-DL-NEXT: v_mov_b32_e32 v1, s1 +; GCN-DL-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DL-NEXT: s_lshr_b32 s0, s2, 28 +; GCN-DL-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GCN-DL-NEXT: s_lshr_b32 s11, s4, 28 +; GCN-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GCN-DL-NEXT: s_bfe_u32 s13, s4, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s15, s4, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GCN-DL-NEXT: s_and_b32 s4, s4, 15 +; GCN-DL-NEXT: s_bfe_u32 s1, s2, 0x40018 +; GCN-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 +; GCN-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GCN-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GCN-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GCN-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GCN-DL-NEXT: s_and_b32 s2, s2, 15 +; GCN-DL-NEXT: v_mov_b32_e32 v2, s4 +; GCN-DL-NEXT: v_mov_b32_e32 v3, s5 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s2, v2, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s17 +; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v3 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s10, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s16 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s9, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s15 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s8, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s14 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s7, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s13 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s6, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s12 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s1, v4, v3 +; GCN-DL-NEXT: v_mov_b32_e32 v4, s11 +; GCN-DL-NEXT: v_mad_u32_u24 v3, s0, v4, v3 +; GCN-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GCN-DL-NEXT: global_store_dword v[0:1], v2, off +; GCN-DL-NEXT: s_endpgm + <8 x i4> addrspace(1)* %src2, + i32 addrspace(1)* nocapture %dst) { +entry: + %vec1 = load <8 x i4>, <8 x i4> addrspace(1)* %src1 + %vec2 = load <8 x i4>, <8 x i4> addrspace(1)* %src2 + + %v1e0 = extractelement <8 x i4> %vec1, i64 0 + %cv1e0 = zext i4 %v1e0 to i32 + %v2e0 = extractelement <8 x i4> %vec2, i64 0 + %cv2e0 = zext i4 %v2e0 to i32 + %mul0 = mul nuw 
nsw i32 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i4> %vec1, i64 1 + %cv1e1 = zext i4 %v1e1 to i32 + %v2e1 = extractelement <8 x i4> %vec2, i64 1 + %cv2e1 = zext i4 %v2e1 to i32 + %mul1 = mul nuw nsw i32 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i4> %vec1, i64 2 + %cv1e2 = zext i4 %v1e2 to i32 + %v2e2 = extractelement <8 x i4> %vec2, i64 2 + %cv2e2 = zext i4 %v2e2 to i32 + %mul2 = mul nuw nsw i32 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i4> %vec1, i64 3 + %cv1e3 = zext i4 %v1e3 to i32 + %v2e3 = extractelement <8 x i4> %vec2, i64 3 + %cv2e3 = zext i4 %v2e3 to i32 + %mul3 = mul nuw nsw i32 %cv1e3, %cv2e3 + + %v1e4 = extractelement <8 x i4> %vec1, i64 4 + %cv1e4 = zext i4 %v1e4 to i32 + %v2e4 = extractelement <8 x i4> %vec2, i64 4 + %cv2e4 = zext i4 %v2e4 to i32 + %mul4 = mul nuw nsw i32 %cv1e4, %cv2e4 + + %v1e5 = extractelement <8 x i4> %vec1, i64 5 + %cv1e5 = zext i4 %v1e5 to i32 + %v2e5 = extractelement <8 x i4> %vec2, i64 5 + %cv2e5 = zext i4 %v2e5 to i32 + %mul5 = mul nuw nsw i32 %cv1e5, %cv2e5 + + %v1e6 = extractelement <8 x i4> %vec1, i64 6 + %cv1e6 = zext i4 %v1e6 to i32 + %v2e6 = extractelement <8 x i4> %vec2, i64 6 + %cv2e6 = zext i4 %v2e6 to i32 + %mul6 = mul nuw nsw i32 %cv1e6, %cv2e6 + + %v1e7 = extractelement <8 x i4> %vec1, i64 7 + %cv1e7 = zext i4 %v1e7 to i32 + %v2e7 = extractelement <8 x i4> %vec2, i64 7 + %cv2e7 = zext i4 %v2e7 to i32 + %mul7 = mul nuw nsw i32 %cv1e7, %cv2e7 + + %acc = load i32, i32 addrspace(1)* %dst, align 4 + %add1 = add i32 %mul0, %acc + %add = add i32 %mul0, %add1 + %add2 = add i32 %add1, %mul1 + %add3 = add i32 %add2, %mul2 + %add4 = add i32 %add3, %mul3 + %add5 = add i32 %add4, %mul4 + %add6 = add i32 %add5, %mul5 + %add7 = add i32 %add6, %mul6 + %add8 = add i32 %add7, %mul7 + + %res = add i32 %add, %add8 + store i32 %res, i32 addrspace(1)* %dst, align 4 + ret void +}