diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10218,6 +10218,26 @@ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); } + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: { + SDValue NarrowOp = Op->getOperand(0); + auto NarrowVT = NarrowOp.getValueType(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { + auto *VTSign = cast(Op->getOperand(1)); + NarrowVT = VTSign->getVT(); + } + if (!NarrowVT.isByteSized()) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowVT.getSizeInBits() / 8; + + if (SrcIndex >= NarrowByteWidth) + return std::nullopt; + return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); + } + + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast(Op->getOperand(1)); if (!ShiftOp) @@ -10300,6 +10320,7 @@ return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); } + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast(Op->getOperand(1)); if (!ShiftOp) @@ -10322,7 +10343,10 @@ return BytesProvided - ByteShift > Index ? calculateSrcByte(Op->getOperand(0), StartingIndex, Index + ByteShift) - : ByteProvider::getConstantZero(); + : Op.getOpcode() == ISD::SRL + ? 
std::optional>( + ByteProvider::getConstantZero()) + : std::nullopt; } case ISD::SHL: { @@ -10346,9 +10370,14 @@ } case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: { + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: { SDValue NarrowOp = Op->getOperand(0); - unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { + auto *VTSign = cast(Op->getOperand(1)); + NarrowBitWidth = VTSign->getVT().getSizeInBits(); + } if (NarrowBitWidth % 8 != 0) return std::nullopt; uint64_t NarrowByteWidth = NarrowBitWidth / 8; @@ -10410,6 +10439,13 @@ return std::nullopt; } + case ISD::EXTRACT_VECTOR_ELT: { + if (Op.getScalarValueSizeInBits() != 32) + return std::nullopt; + + return calculateSrcByte(Op, StartingIndex, Index); + } + default: { return std::nullopt; } @@ -11991,6 +12027,50 @@ return Accum; } +// Collect the ultimate src of each of the mul24 node's operands, and confirm +// each operand is 8 bits. +static std::optional> +handleMulOperand(const SDValue &MulOperand) { + auto Byte0 = calculateByteProvider(MulOperand, 0, 0); + if (!Byte0.has_value() || Byte0->isConstantZero()) { + return std::nullopt; + } + auto Byte1 = calculateByteProvider(MulOperand, 1, 0); + if (Byte1.has_value() && !Byte1->isConstantZero()) { + return std::nullopt; + } + return Byte0; +} + +static bool matchChain(ByteProvider &Src0, ByteProvider &Src1, + SmallVector, 4> &Src0s, + SmallVector, 4> &Src1s) { + assert(Src0.Src.has_value() && Src1.Src.has_value()); + bool Placed = false; + int Prev = -1; + for (auto &BP : {Src0, Src1}) { + for (int I = 0; I < 2; I++) { + // Operands from mul24 cannot provide for the same src chain + if (I == Prev) + continue; + SmallVector, 4> &Srcs = I == 0 ? 
Src0s : Src1s; + // Matches chain if it is using the next least significant byte of the + // same src + if (*BP.Src == Srcs[0].Src && + (BP.SrcOffset == (Srcs[Srcs.size() - 1].SrcOffset - 1))) { + Srcs.push_back(BP); + // We have provided for both chains + if (Prev != -1) { + Placed = true; + break; + } + Prev = I; + } + } + } + return Placed; +} + SDValue SITargetLowering::performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -12046,6 +12126,147 @@ return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args); } } + + // v_dot4 combining + auto ST = getSubtarget(); + if ((LHS.getOpcode() == AMDGPUISD::MUL_I24 || + LHS.getOpcode() == AMDGPUISD::MUL_U24 || + RHS.getOpcode() == AMDGPUISD::MUL_I24 || + RHS.getOpcode() == AMDGPUISD::MUL_U24) && + ST->hasDot7Insts() && (ST->hasDot1Insts() || ST->hasDot8Insts())) { + auto TempNode = SDValue(N, 0); + auto MulIdx = (LHS.getOpcode() == AMDGPUISD::MUL_I24 || + LHS.getOpcode() == AMDGPUISD::MUL_U24) + ? 0 + : 1; + auto MulOpcode = TempNode.getOperand(MulIdx).getOpcode(); + SmallVector, 4> Src0s; + SmallVector, 4> Src1s; + SmallVector Src2s; + + // Match the v_dot4 tree, while collecting src nodes. + for (int I = 0; I < 4; I++) { + if (LHS.getOpcode() != MulOpcode && RHS.getOpcode() != MulOpcode) + break; + auto MulIdx = LHS.getOpcode() == MulOpcode ? 
0 : 1; + auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0)); + if (!Src0.has_value()) + break; + auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1)); + if (!Src1.has_value()) + break; + if (I == 0) { + assert(Src0->Src.has_value() && Src1->Src.has_value()); + Src0s.push_back(*Src0); + Src1s.push_back(*Src1); + } else if (!matchChain(*Src0, *Src1, Src0s, Src1s)) + break; + auto AddIdx = 1 - MulIdx; + // Allow the special case where add (add (mul24, 0), mul24) became -> + // add (mul24, mul24) + if (I == 2 && (TempNode->getOperand(AddIdx).getOpcode() == MulOpcode)) { + Src2s.push_back(TempNode->getOperand(AddIdx)); + auto Src0 = + handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0)); + if (!Src0.has_value()) + break; + auto Src1 = + handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1)); + if (!Src1.has_value()) + break; + if (!matchChain(*Src0, *Src1, Src0s, Src1s)) + break; + Src2s.push_back(DAG.getConstant(0, SL, MVT::i32)); + break; + } + + TempNode = TempNode->getOperand(AddIdx); + LHS = TempNode->getOperand(0); + RHS = TempNode->getOperand(1); + Src2s.push_back(TempNode); + } + + auto ChainLength = std::min(Src0s.size(), Src1s.size()); + if (ChainLength < 2) + return SDValue(); + + auto Src0Start = Src0s[ChainLength - 1].SrcOffset; + auto Src1Start = Src1s[ChainLength - 1].SrcOffset; + + auto Src0 = *Src0s[0].Src; + auto Src1 = *Src1s[0].Src; + if (Src0.getValueSizeInBits() < 16 || Src1.getValueSizeInBits() < 16) + return SDValue(); + + if (Src0.getValueSizeInBits() > 32) + Src0 = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src0); + + if (Src1.getValueSizeInBits() > 32) + Src1 = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src1); + + if (ChainLength < 4) { + // TODO -- combine well formed 2 x v2i8 Src0/1s into a single v_dot4 + assert((Src0.getValueSizeInBits() == 32 && + Src1.getValueSizeInBits() == 32) || + ChainLength == 2); + if (Src0.getValueSizeInBits() == 32) { + assert(Src0Start + ChainLength < 5); + 
auto BitMask = ChainLength == 2 + ? (0x0c0c0000 + 0x0100 + 0x0101 * Src0Start) + : (0x0c + 0x020100 + 0x010101 * Src0Start); + Src0 = DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, Src0, Src0, + DAG.getConstant(BitMask, SL, MVT::i32)); + } + if (Src1.getValueSizeInBits() == 32) { + assert(Src1Start + ChainLength < 5); + auto BitMask = ChainLength == 2 + ? (0x0c0c0000 + 0x0100 + 0x0101 * Src1Start) + : (0x0c + 0x020100 + 0x010101 * Src1Start); + Src1 = DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, Src1, Src1, + DAG.getConstant(BitMask, SL, MVT::i32)); + } + } + + if (Src0.getValueSizeInBits() == 16) + Src0 = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Src0); + if (Src1.getValueSizeInBits() == 16) + Src1 = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Src1); + + SDValue Src2 = Src2s[ChainLength - 1]; + + std::optional Opcode; + if (ST->getGeneration() == AMDGPUSubtarget::GFX11) { + Opcode = MulOpcode == AMDGPUISD::MUL_U24 ? AMDGPU::V_DOT4_U32_U8_gfx11 + : AMDGPU::V_DOT4_I32_IU8_gfx11; + } + if (ST->getGeneration() == AMDGPUSubtarget::GFX10) { + Opcode = MulOpcode == AMDGPUISD::MUL_U24 ? AMDGPU::V_DOT4_U32_U8_gfx10 + : AMDGPU::V_DOT4_I32_I8_gfx10; + } + + if (ST->getGeneration() == AMDGPUSubtarget::GFX9) { + Opcode = MulOpcode == AMDGPUISD::MUL_U24 ? 
AMDGPU::V_DOT4_U32_U8 + : AMDGPU::V_DOT4_I32_I8; + } + + if (!Opcode.has_value()) + return SDValue(); + + SmallVector Ops = {DAG.getTargetConstant(8, SL, MVT::i32), + Src0, + DAG.getTargetConstant(8, SL, MVT::i32), + Src1, + DAG.getTargetConstant(8, SL, MVT::i32), + Src2, + DAG.getTargetConstant(0, SL, MVT::i1), + DAG.getTargetConstant(0, SL, MVT::i1), + DAG.getTargetConstant(0, SL, MVT::i1), + DAG.getTargetConstant(0, SL, MVT::i1), + DAG.getTargetConstant(0, SL, MVT::i1)}; + SmallVector VTList = {MVT::i32}; + return SDValue(DAG.getMachineNode(*Opcode, SL, VTList, Ops), 0); + } + return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -290,22 +290,6 @@ (sext_inreg (srl node:$src, (i32 FromBitIndex)), i8)))))>; -foreach Type = ["I", "U"] in - foreach Index = 0-3 in { - // Defines patterns that extract each Index'ed 8bit from an unsigned - // 32bit scalar value; - def Type#Index#"_8bit" : Extract; - - // Defines multiplication patterns where the multiplication is happening on each - // Index'ed 8bit of a 32bit scalar value. - - def Mul#Type#_Elt#Index : PatFrag< - (ops node:$src0, node:$src1), - (!cast(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse)) - (!cast(Type#Index#"_8bit") node:$src0), - (!cast(Type#Index#"_8bit") node:$src1))>; - } - // Different variants of dot8 patterns cause a huge increase in the compile time. // Define non-associative/commutative add/mul to prevent permutation in the dot8 // pattern. 
@@ -421,13 +405,6 @@ def : UDot2Pat; def : SDot2Pat; -foreach Type = ["U", "I"] in - let SubtargetPredicate = !cast("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in - def : GCNPat < - !cast(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y, - (add_oneuse lhs, (!cast("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))), - (!cast("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; - foreach Type = ["U", "I"] in let SubtargetPredicate = !cast("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in def : GCNPat < diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -5,6 +5,9 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s + define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot4_acc32: @@ -127,6 +130,23 @@ ; GFX10-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc32: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -358,6 +378,45 @@ ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc16: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX11-DL-NEXT: v_bfe_i32 v7, v0, 0, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v6, v6, 0, 8 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v7, v3 +; GFX11-DL-NEXT: v_bfe_i32 v4, v8, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v7, v9, 0, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v7, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3 +; GFX11-DL-NEXT: global_store_b16 
v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -547,6 +606,35 @@ ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc8: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v3, v2, v0, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v2, v0, v3 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -683,23 +771,25 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0302 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: 
global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 +; GFX9-DL-NEXT: v_perm_b32 v6, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_perm_b32 v5, v2, v2, s0 +; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 -; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 -; GFX9-DL-NEXT: v_add3_u32 v2, v5, v3, v2 -; GFX9-DL-NEXT: v_add3_u32 v1, v2, v6, v1 +; GFX9-DL-NEXT: v_dot4_i32_i8 v5, v6, v5, s4 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, v5 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -710,24 +800,52 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 +; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mul_i32_i24_e32 v5, v0, v3 +; GFX10-DL-NEXT: v_perm_b32 v3, v2, v2, 0xc0c0100 +; GFX10-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v5, v1, 0, 8 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302 +; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0302 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_dot4_i32_i8 v0, v3, v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_dot4_i32_i8 v0, v2, v1, v0 +; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_multiuse_mul1: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_perm_b32 v2, v1, v1, 0xc0c0100 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_perm_b32 v3, v0, v0, 0xc0c0100 +; GFX11-DL-NEXT: v_bfe_i32 v4, v0, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v5, v1, 0, 8 +; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302 +; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0302 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: v_dot4_i32_iu8 v2, v3, 
v2, s2 +; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_i32_i24 v2, v4, v5, v2 +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, v2 +; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -879,17 +997,8 @@ ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 -; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v2, v5, s0, v3 -; GFX9-DL-NEXT: v_add3_u32 v1, v2, v6, v1 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -902,21 +1011,29 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16 v0, 8, v1 -; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16 v3, 8, v2 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s2 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc32_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1139,6 +1256,52 @@ ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc16_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 
v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_ashrrev_i16 v4, 8, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_ashrrev_i16 v5, 8, v0 +; GFX11-DL-NEXT: v_bfe_i32 v6, v0, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v7, v1, 0, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_ashrrev_i16 v6, 8, v1 +; GFX11-DL-NEXT: v_ashrrev_i16 v7, 8, v0 +; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_perm_b32 v0, v7, v0, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr 
addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1171,23 +1171,25 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0100 +; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0302 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX9-DL-NEXT: v_perm_b32 v6, v1, v1, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_perm_b32 v5, v2, v2, s0 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, s0 -; GFX9-DL-NEXT: v_add3_u32 v2, v5, v3, v2 -; GFX9-DL-NEXT: v_add3_u32 v1, v2, v6, v1 +; GFX9-DL-NEXT: v_dot4_u32_u8 v5, v6, v5, s4 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, v5 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -1198,23 +1200,23 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 +; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3 +; GFX10-DL-NEXT: v_perm_b32 v3, v2, v2, 0xc0c0100 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v1 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302 +; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0302 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v4, v0, v5 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v4, v5, v0 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v1, v0 +; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1364,23 +1366,25 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0302 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: 
global_load_dword v1, v0, s[4:5] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v4, v1, 8, 8 +; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 8, 8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, s0 -; GFX9-DL-NEXT: v_add_u32_e32 v4, s0, v2 -; GFX9-DL-NEXT: v_add3_u32 v2, v2, v3, v6 -; GFX9-DL-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-DL-NEXT: v_mad_u32_u24 v5, v5, v6, s1 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v3, v4, v5 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 +; GFX9-DL-NEXT: v_add3_u32 v1, v5, s1, v1 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -1398,16 +1402,17 @@ ; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 8, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_bfe_u32 v3, v2, 8, 8 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v2 +; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0302 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa 
v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s2, v0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v4, v3 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX10-DL-NEXT: v_mad_u32_u24 v3, v4, v5, v0 +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_add3_u32 v0, v0, s2, v1 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1767,14 +1772,8 @@ ; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_add3_u32 v2, v3, s0, v4 -; GFX9-DL-NEXT: v_add3_u32 v1, v2, v5, v1 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -1787,21 +1786,11 @@ ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] -; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; 
GFX10-DL-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_add3_u32 v0, v4, s2, v0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -538,11 +538,11 @@ ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 ; GFX8-UNPACKED-NEXT: image_load v[1:4], v0, s[4:11] dmask:0x7 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0x1000504 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-UNPACKED-NEXT: v_perm_b32 v0, v1, v2, s0 ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; 
GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4 @@ -637,12 +637,11 @@ ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0x1000504 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2] +; GFX8-UNPACKED-NEXT: v_perm_b32 v3, v3, v4, s0 +; GFX8-UNPACKED-NEXT: v_perm_b32 v2, v1, v2, s0 +; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1671,18 +1671,17 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_perm_b32 v0, v0, s4, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1827,18 +1826,17 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_perm_b32 v1, v1, s4, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -234,9 +234,9 @@ ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, v2 ; TONGA-NEXT: ; return to shader part epilog ; @@ -284,9 +284,9 @@ ; TONGA-NEXT: v_mov_b32_e32 v6, v3 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; 
TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v3, v4, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, v5 ; TONGA-NEXT: v_mov_b32_e32 v2, v6 ; TONGA-NEXT: ; return to shader part epilog @@ -370,11 +370,10 @@ ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; TONGA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; TONGA-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0 +; TONGA-NEXT: v_perm_b32 v1, v2, v3, s0 ; TONGA-NEXT: ; return to shader part epilog ; ; GFX81-LABEL: image_sample_b_2d_v4f16: @@ -421,11 +420,10 @@ ; TONGA-NEXT: v_mov_b32_e32 v7, v3 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; TONGA-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v3, v4, s0 +; TONGA-NEXT: v_perm_b32 v1, v5, v6, s0 ; TONGA-NEXT: v_mov_b32_e32 v2, v7 ; TONGA-NEXT: ; return to shader part epilog ;