diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10444,8 +10444,7 @@ if (Depth >= 6) return std::nullopt; - auto ValueSize = Op.getValueSizeInBits(); - if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32) + if (Op.getValueSizeInBits() < 8) return std::nullopt; switch (Op->getOpcode()) { @@ -10679,8 +10678,6 @@ auto VecIdx = IdxOp->getZExtValue(); auto ScalarSize = Op.getScalarValueSizeInBits(); if (ScalarSize != 32) { - if ((VecIdx + 1) * ScalarSize > 32) - return std::nullopt; Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; } @@ -10897,8 +10894,8 @@ if (PermNodes.size() != 4) return SDValue(); - int FirstSrc = 0; - std::optional SecondSrc; + std::pair FirstSrc(0, PermNodes[0].SrcOffset / 4); + std::optional> SecondSrc; uint64_t PermMask = 0x00000000; for (size_t i = 0; i < PermNodes.size(); i++) { auto PermOp = PermNodes[i]; @@ -10906,27 +10903,37 @@ // by sizeof(Src2) = 4 int SrcByteAdjust = 4; - if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { - if (SecondSrc.has_value()) - if (!PermOp.hasSameSrc(PermNodes[*SecondSrc])) + // If the Src uses a byte from a different DWORD, then it corresponds + // with a difference source + if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) || + ((PermOp.SrcOffset / 4) != FirstSrc.second)) { + if (SecondSrc) + if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) || + ((PermOp.SrcOffset / 4) != SecondSrc->second)) return SDValue(); // Set the index of the second distinct Src node - SecondSrc = i; - assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8)); + SecondSrc = {i, PermNodes[i].SrcOffset / 4}; + assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8)); SrcByteAdjust = 0; } - assert(PermOp.SrcOffset + SrcByteAdjust < 8); + assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8); assert(!DAG.getDataLayout().isBigEndian()); - PermMask |= (PermOp.SrcOffset + 
SrcByteAdjust) << (i * 8); + PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8); + } + SDLoc DL(N); + SDValue Op = *PermNodes[FirstSrc.first].Src; + + if (Op.getValueSizeInBits() > 32) { + Op = DAG.getBitcast(MVT::getIntegerVT(Op.getValueSizeInBits()), Op); + if (FirstSrc.second) + Op = DAG.getNode(ISD::SRL, DL, Op.getValueType(), Op, + DAG.getConstant(32 * FirstSrc.second, DL, MVT::i32)); + Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op); } - - SDValue Op = *PermNodes[FirstSrc].Src; - SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src - : *PermNodes[FirstSrc].Src; // Check that we are not just extracting the bytes in order from an op - if (Op == OtherOp && Op.getValueSizeInBits() == 32) { + if (!SecondSrc && Op.getValueSizeInBits() == 32) { int Low16 = PermMask & 0xffff; int Hi16 = (PermMask & 0xffff0000) >> 16; @@ -10938,8 +10945,21 @@ return DAG.getBitcast(MVT::getIntegerVT(32), Op); } + SDValue OtherOp = + SecondSrc.has_value() ? *PermNodes[SecondSrc->first].Src : Op; + + if (SecondSrc && OtherOp.getValueSizeInBits() > 32) { + OtherOp = DAG.getBitcast( + MVT::getIntegerVT(OtherOp.getValueSizeInBits()), OtherOp); + if (SecondSrc->second) + OtherOp = DAG.getNode( + ISD::SRL, DL, OtherOp.getValueType(), OtherOp, + DAG.getConstant(32 * SecondSrc->second, DL, MVT::i32)); + OtherOp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, OtherOp); + } + if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { - SDLoc DL(N); + assert(Op.getValueType().isByteSized() && OtherOp.getValueType().isByteSized()); @@ -12311,17 +12331,24 @@ return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs); } +struct DotSrc { + SDValue SrcOp; + int64_t PermMask; + int64_t DWordOffset; +}; + static void placeSources(ByteProvider &Src0, ByteProvider &Src1, - SmallVectorImpl> &Src0s, - SmallVectorImpl> &Src1s, - int Step) { + SmallVectorImpl &Src0s, + SmallVectorImpl &Src1s, int Step) { assert(Src0.Src.has_value() && Src1.Src.has_value()); // Src0s and Src1s are empty, just place 
arbitrarily if (Step == 0) { - Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c}); - Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c}); + Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c, + Src0.SrcOffset / 4}); + Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c, + Src1.SrcOffset / 4}); return; } @@ -12334,38 +12361,38 @@ unsigned FMask = 0xFF << (8 * (3 - Step)); unsigned FirstMask = - BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask); + (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); unsigned SecondMask = - BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask); + (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); // Attempt to find Src vector which contains our SDValue, if so, add our // perm mask to the existing one. If we are unable to find a match for the // first SDValue, attempt to find match for the second. int FirstGroup = -1; for (int I = 0; I < 2; I++) { - SmallVectorImpl> &Srcs = - I == 0 ? Src0s : Src1s; - auto MatchesFirst = [&BPP](std::pair IterElt) { - return IterElt.first == *BPP.first.Src; + SmallVectorImpl &Srcs = I == 0 ? Src0s : Src1s; + auto MatchesFirst = [&BPP](DotSrc &IterElt) { + return IterElt.SrcOp == *BPP.first.Src && + (IterElt.DWordOffset == (BPP.first.SrcOffset / 4)); }; auto Match = std::find_if(Srcs.begin(), Srcs.end(), MatchesFirst); if (Match != Srcs.end()) { - Match->second = addPermMasks(FirstMask, Match->second); + Match->PermMask = addPermMasks(FirstMask, Match->PermMask); FirstGroup = I; break; } } if (FirstGroup != -1) { - SmallVectorImpl> &Srcs = - FirstGroup == 1 ? Src0s : Src1s; - auto MatchesSecond = [&BPP](std::pair IterElt) { - return IterElt.first == *BPP.second.Src; + SmallVectorImpl &Srcs = FirstGroup == 1 ? 
Src0s : Src1s; + auto MatchesSecond = [&BPP](DotSrc &IterElt) { + return IterElt.SrcOp == *BPP.second.Src && + (IterElt.DWordOffset == (BPP.second.SrcOffset / 4)); }; auto Match = std::find_if(Srcs.begin(), Srcs.end(), MatchesSecond); if (Match != Srcs.end()) { - Match->second = addPermMasks(SecondMask, Match->second); + Match->PermMask = addPermMasks(SecondMask, Match->PermMask); } else - Srcs.push_back({*BPP.second.Src, SecondMask}); + Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4}); return; } } @@ -12377,28 +12404,38 @@ unsigned FMask = 0xFF << (8 * (3 - Step)); Src0s.push_back( - {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))}); + {*Src0.Src, + ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), + Src0.SrcOffset / 4}); Src1s.push_back( - {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))}); + {*Src1.Src, + ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), + Src1.SrcOffset / 4}); return; } -static SDValue -resolveSources(SelectionDAG &DAG, SDLoc SL, - SmallVectorImpl> &Srcs, - bool IsSigned, bool IsAny) { +static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, + SmallVectorImpl &Srcs, bool IsSigned, + bool IsAny) { // If we just have one source, just permute it accordingly. 
if (Srcs.size() == 1) { auto Elt = Srcs.begin(); - auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32); + auto EltOp = DAG.getBitcastedAnyExtOrTrunc( + Elt->SrcOp, SL, MVT::getIntegerVT(Elt->SrcOp.getValueSizeInBits())); + if (Elt->DWordOffset) { + EltOp = DAG.getNode(ISD::SRL, SL, EltOp.getValueType(), EltOp, + DAG.getConstant(Elt->DWordOffset * 32, SL, MVT::i32)); + } - if (Elt->second == 0x3020100) - return EltVal; + EltOp = DAG.getBitcastedAnyExtOrTrunc(EltOp, SL, MVT::i32); + if (Elt->PermMask == 0x3020100) { + return EltOp; + } - return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal, - DAG.getConstant(Elt->second, SL, MVT::i32)); + return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp, + DAG.getConstant(Elt->PermMask, SL, MVT::i32)); } auto FirstElt = Srcs.begin(); @@ -12409,8 +12446,26 @@ // If we have multiple sources in the chain, combine them via perms (using // calculated perm mask) and Ors. while (true) { - auto FirstMask = FirstElt->second; - auto SecondMask = SecondElt->second; + auto FirstEltOp = DAG.getBitcastedAnyExtOrTrunc( + FirstElt->SrcOp, SL, + MVT::getIntegerVT(FirstElt->SrcOp.getValueSizeInBits())); + if (FirstElt->DWordOffset) { + FirstEltOp = DAG.getNode( + ISD::SRL, SL, FirstEltOp.getValueType(), FirstEltOp, + DAG.getConstant(FirstElt->DWordOffset * 32, SL, MVT::i32)); + } + + auto SecondEltOp = DAG.getBitcastedAnyExtOrTrunc( + SecondElt->SrcOp, SL, + MVT::getIntegerVT(SecondElt->SrcOp.getValueSizeInBits())); + if (SecondElt->DWordOffset) { + SecondEltOp = DAG.getNode( + ISD::SRL, SL, SecondEltOp.getValueType(), SecondEltOp, + DAG.getConstant(SecondElt->DWordOffset * 32, SL, MVT::i32)); + } + + auto FirstMask = FirstElt->PermMask; + auto SecondMask = SecondElt->PermMask; unsigned FirstCs = FirstMask & 0x0c0c0c0c; unsigned FirstPlusFour = FirstMask | 0x04040404; @@ -12419,10 +12474,8 @@ FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs; auto PermMask = addPermMasks(FirstMask, SecondMask); - auto 
FirstVal = - DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32); - auto SecondVal = - DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32); + auto FirstVal = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL, MVT::i32); + auto SecondVal = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL, MVT::i32); Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal, SecondVal, @@ -12436,12 +12489,20 @@ // If we only have a FirstElt, then just combine that into the cumulative // source node if (SecondElt == Srcs.end()) { - auto EltVal = - DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32); + auto EltOp = DAG.getBitcastedAnyExtOrTrunc( + FirstElt->SrcOp, SL, + MVT::getIntegerVT(FirstElt->SrcOp.getValueSizeInBits())); + if (FirstElt->DWordOffset) { + EltOp = DAG.getNode( + ISD::SRL, SL, EltOp.getValueType(), EltOp, + DAG.getConstant(FirstElt->DWordOffset * 32, SL, MVT::i32)); + } + + EltOp = DAG.getBitcastedAnyExtOrTrunc(EltOp, SL, MVT::i32); Perms.push_back( - DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal, - DAG.getConstant(FirstElt->second, SL, MVT::i32))); + DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp, + DAG.getConstant(FirstElt->PermMask, SL, MVT::i32))); break; } } @@ -12452,9 +12513,8 @@ : Perms[0]; } -static void fixMasks(SmallVectorImpl> &Srcs, - unsigned ChainLength) { - for (auto &[EntryVal, EntryMask] : Srcs) { +static void fixMasks(SmallVectorImpl &Srcs, unsigned ChainLength) { + for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) { EntryMask = EntryMask >> ((4 - ChainLength) * 8); auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000; EntryMask += ZeroMask; @@ -12498,8 +12558,9 @@ (MulOpcode == ISD::MUL && TempNode->getOperand(MulIdx)->getFlags().hasNoSignedWrap() && !TempNode->getOperand(MulIdx)->getFlags().hasNoUnsignedWrap()); - SmallVector, 4> Src0s; - SmallVector, 4> Src1s; + + SmallVector Src0s; + SmallVector Src1s; SmallVector Src2s; // Match the v_dot4 tree, while collecting src nodes. 
@@ -12569,11 +12630,11 @@ // (commutation) bool UseOriginalSrc = false; if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 && - Src0s.begin()->second == Src1s.begin()->second && - Src0s.begin()->first.getValueSizeInBits() == 32 && - Src1s.begin()->first.getValueSizeInBits() == 32) { + Src0s.begin()->PermMask == Src1s.begin()->PermMask && + Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 && + Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) { SmallVector SrcBytes; - auto Src0Mask = Src0s.begin()->second; + auto Src0Mask = Src0s.begin()->PermMask; SrcBytes.push_back(Src0Mask & 0xFF000000); bool UniqueEntries = true; for (auto I = 1; I < 4; I++) { @@ -12588,11 +12649,31 @@ if (UniqueEntries) { UseOriginalSrc = true; - // Must be 32 bits to enter above conditional - assert(Src0s.begin()->first.getValueSizeInBits() == 32); - assert(Src1s.begin()->first.getValueSizeInBits() == 32); - Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first); - Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first); + + auto FirstElt = Src0s.begin(); + auto FirstEltOp = DAG.getBitcastedAnyExtOrTrunc( + FirstElt->SrcOp, SL, + MVT::getIntegerVT(FirstElt->SrcOp.getValueSizeInBits())); + if (FirstElt->DWordOffset) { + FirstEltOp = DAG.getNode( + ISD::SRL, SL, FirstEltOp.getValueType(), FirstEltOp, + DAG.getConstant(FirstElt->DWordOffset * 32, SL, MVT::i32)); + } + + auto SecondElt = Src1s.begin(); + auto SecondEltOp = DAG.getBitcastedAnyExtOrTrunc( + SecondElt->SrcOp, SL, + MVT::getIntegerVT(SecondElt->SrcOp.getValueSizeInBits())); + if (SecondElt->DWordOffset) { + SecondEltOp = DAG.getNode( + ISD::SRL, SL, SecondEltOp.getValueType(), SecondEltOp, + DAG.getConstant(SecondElt->DWordOffset * 32, SL, MVT::i32)); + } + + Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL, + MVT::getIntegerVT(32)); + Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL, + MVT::getIntegerVT(32)); } } diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll 
b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -4580,4 +4580,901 @@ ret void } +define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1, +; GFX7-LABEL: idot4_acc32_hilo: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: idot4_acc32_hilo: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, 
v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v2 +; GFX8-NEXT: v_mul_u32_u24_sdwa v7, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v3, v6, v3, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_mad_u32_u24 v3, v8, v5, v3 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v4, v3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-NODL-LABEL: idot4_acc32_hilo: +; GFX9-NODL: ; %bb.0: ; %entry +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 +; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: s_endpgm +; +; GFX9-DL-LABEL: idot4_acc32_hilo: +; GFX9-DL: ; 
%bb.0: ; %entry +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 +; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: s_endpgm +; +; GFX10-DL-LABEL: idot4_acc32_hilo: +; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0 +; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc32_hilo: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] offset:4 +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_nop 0 +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm + ptr addrspace(1) %src2, + ptr addrspace(1) nocapture %dst) { +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, 
ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + + %v1e0 = extractelement <8 x i8> %vec1, i64 4 + %cv1e0 = zext i8 %v1e0 to i32 + %v2e0 = extractelement <8 x i8> %vec2, i64 0 + %cv2e0 = zext i8 %v2e0 to i32 + %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i8> %vec1, i64 5 + %cv1e1 = zext i8 %v1e1 to i32 + %v2e1 = extractelement <8 x i8> %vec2, i64 1 + %cv2e1 = zext i8 %v2e1 to i32 + %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i8> %vec1, i64 6 + %cv1e2 = zext i8 %v1e2 to i32 + %v2e2 = extractelement <8 x i8> %vec2, i64 2 + %cv2e2 = zext i8 %v2e2 to i32 + %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i8> %vec1, i64 7 + %cv1e3 = zext i8 %v1e3 to i32 + %v2e3 = extractelement <8 x i8> %vec2, i64 3 + %cv2e3 = zext i8 %v2e3 to i32 + %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 + + %add1 = add i32 %mul1, 0 + %add2 = add i32 %add1, %mul2 + %add3 = add i32 %add2, %mul3 + %add4 = add i32 %add3, %mul4 + store i32 %add4, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1, +; GFX7-LABEL: idot4_acc32_lohi: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_u32 v3, v2, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, v3, v6 +; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: idot4_acc32_lohi: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2 +; GFX8-NEXT: v_bfe_u32 v8, v2, 8, 8 +; GFX8-NEXT: v_mad_u32_u24 v3, v3, v7, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_mad_u32_u24 v3, v5, v8, v3 +; GFX8-NEXT: v_mad_u32_u24 v2, v6, v2, v3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-NODL-LABEL: idot4_acc32_lohi: +; GFX9-NODL: ; %bb.0: ; %entry +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; 
GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 +; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: s_endpgm +; +; GFX9-DL-LABEL: idot4_acc32_lohi: +; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x10302 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 +; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: s_endpgm +; +; GFX10-DL-LABEL: idot4_acc32_lohi: +; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_clause 0x1 +; 
GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-DL-NEXT: s_waitcnt vmcnt(1) +; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x10302 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3020001 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc32_lohi: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x10302 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_nop 0 +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm + ptr addrspace(1) %src2, + ptr addrspace(1) nocapture %dst) { +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + + %v1e0 = extractelement <8 x i8> %vec1, i64 0 + %cv1e0 = zext i8 %v1e0 to i32 + %v2e0 = extractelement <8 x i8> %vec2, i64 7 + %cv2e0 = zext i8 %v2e0 to i32 + %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i8> %vec1, i64 1 + %cv1e1 = zext i8 %v1e1 to i32 + %v2e1 = extractelement <8 x i8> %vec2, i64 6 + %cv2e1 = zext 
i8 %v2e1 to i32 + %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i8> %vec1, i64 2 + %cv1e2 = zext i8 %v1e2 to i32 + %v2e2 = extractelement <8 x i8> %vec2, i64 5 + %cv2e2 = zext i8 %v2e2 to i32 + %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i8> %vec1, i64 3 + %cv1e3 = zext i8 %v1e3 to i32 + %v2e3 = extractelement <8 x i8> %vec2, i64 4 + %cv2e3 = zext i8 %v2e3 to i32 + %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 + + %add1 = add i32 %mul1, 0 + %add2 = add i32 %add1, %mul2 + %add3 = add i32 %add2, %mul3 + %add4 = add i32 %add3, %mul4 + store i32 %add4, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1, +; GFX7-LABEL: idot4_acc32_hihi: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4 +; GFX7-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_u32 v3, v2, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v5, v0, 16, 8 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, v3, v6 +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: 
idot4_acc32_hihi: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 8 +; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8 +; GFX8-NEXT: v_mad_u32_u24 v4, v7, v8, v4 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-NODL-LABEL: idot4_acc32_hihi: +; GFX9-NODL: ; %bb.0: ; %entry +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5] offset:4 +; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7] offset:4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: 
v_bfe_u32 v4, v2, 16, 8 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, v4, v5 +; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 +; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NODL-NEXT: s_endpgm +; +; GFX9-DL-LABEL: idot4_acc32_hihi: +; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x1030200 +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3010002 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 +; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, 0 +; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-DL-NEXT: s_endpgm +; +; GFX10-DL-LABEL: idot4_acc32_hihi: +; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4 +; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5] offset:4 +; GFX10-DL-NEXT: s_waitcnt vmcnt(1) +; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0x1030200 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3010002 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc32_hihi: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7] offset:4 +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0x1030200 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3010002 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_nop 0 +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm + ptr addrspace(1) %src2, + ptr addrspace(1) nocapture %dst) { +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + + %v1e0 = extractelement <8 x i8> %vec1, i64 4 + %cv1e0 = zext i8 %v1e0 to i32 + %v2e0 = extractelement <8 x i8> %vec2, i64 6 + %cv2e0 = zext i8 %v2e0 to i32 + %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i8> %vec1, i64 6 + %cv1e1 = zext i8 %v1e1 to i32 + %v2e1 = extractelement <8 x i8> %vec2, i64 4 + %cv2e1 = zext i8 %v2e1 to i32 + %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i8> %vec1, i64 5 + %cv1e2 = zext i8 %v1e2 to i32 + %v2e2 = extractelement <8 x i8> %vec2, i64 7 + %cv2e2 = zext i8 %v2e2 to i32 + %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 + + %v1e3 = extractelement <8 x i8> %vec1, i64 7 + %cv1e3 = zext i8 %v1e3 to i32 + %v2e3 = 
extractelement <8 x i8> %vec2, i64 5 + %cv2e3 = zext i8 %v2e3 to i32 + %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 + + %add1 = add i32 %mul1, 0 + %add2 = add i32 %add1, %mul2 + %add3 = add i32 %add2, %mul3 + %add4 = add i32 %add3, %mul4 + store i32 %add4, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1, +; GFX7-LABEL: idot4_acc32_v8i8: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, s3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX7-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX7-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GFX7-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v2, v2, v3, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_mad_u32_u24 v2, v6, v7, v2 +; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: idot4_acc32_v8i8: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 +; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v0, 
v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 8 +; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v3, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-NODL-LABEL: idot4_acc32_v8i8: +; GFX9-NODL: ; %bb.0: ; %entry +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v3, v4, v5 +; GFX9-NODL-NEXT: v_add3_u32 v0, v1, v6, v0 +; GFX9-NODL-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NODL-NEXT: s_endpgm +; +; GFX9-DL-LABEL: idot4_acc32_v8i8: +; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 +; GFX9-DL-NEXT: global_store_dword v2, v0, 
s[4:5] +; GFX9-DL-NEXT: s_endpgm +; +; GFX10-DL-LABEL: idot4_acc32_v8i8: +; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc32_v8i8: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_nop 0 +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm + ptr addrspace(1) %src2, + ptr addrspace(1) nocapture %dst) { +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <8 x i8>, ptr addrspace(1) %gep1 + + + %v1e0 = extractelement <8 x i8> %vec1, i64 0 + %cv1e0 = zext i8 %v1e0 to i32 + %v2e0 = extractelement <8 x i8> %vec1, i64 4 + %cv2e0 = zext i8 %v2e0 to i32 + %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 + + %v1e1 = extractelement <8 x i8> %vec1, i64 1 + %cv1e1 = zext i8 %v1e1 to i32 + %v2e1 = extractelement <8 x i8> %vec1, i64 5 + %cv2e1 = zext i8 %v2e1 to i32 + %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 + + %v1e2 = extractelement <8 x i8> %vec1, i64 2 + %cv1e2 = zext i8 %v1e2 to i32 + %v2e2 = extractelement <8 x i8> %vec1, i64 6 + %cv2e2 = zext i8 %v2e2 to i32 + %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 + + %v1e3 = 
extractelement <8 x i8> %vec1, i64 3 + %cv1e3 = zext i8 %v1e3 to i32 + %v2e3 = extractelement <8 x i8> %vec1, i64 7 + %cv2e3 = zext i8 %v2e3 to i32 + %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 + + %add1 = add i32 %mul1, 0 + %add2 = add i32 %add1, %mul2 + %add3 = add i32 %add2, %mul3 + %add4 = add i32 %add3, %mul4 + store i32 %add4, ptr addrspace(1) %dst, align 4 + ret void +} + +define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1, +; GFX7-LABEL: idot4_acc32_v16i8: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7] +; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[8:11], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 +; GFX7-NEXT: v_mul_u32_u24_e32 v2, v2, v5 +; GFX7-NEXT: v_bfe_u32 v6, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v4, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v7, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v0, v1 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: idot4_acc32_v16i8: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: v_lshlrev_b32_e32 
v1, 4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[1:2] +; GFX8-NEXT: flat_load_dword v4, v[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4 +; GFX8-NEXT: v_mul_u32_u24_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_1 +; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 8 +; GFX8-NEXT: v_bfe_u32 v8, v4, 16, 8 +; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-NODL-LABEL: idot4_acc32_v16i8: +; GFX9-NODL: ; %bb.0: ; %entry +; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NODL-NEXT: ; kill: killed $vgpr5 +; GFX9-NODL-NEXT: ; kill: killed $vgpr4 +; GFX9-NODL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v0, v5, s[6:7] +; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v2 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v0 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_2 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX9-NODL-NEXT: v_add3_u32 v0, v2, v6, v0 +; GFX9-NODL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NODL-NEXT: s_endpgm +; +; GFX9-DL-LABEL: idot4_acc32_v16i8: +; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-DL-NEXT: s_mov_b32 s0, 0x7050002 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] +; GFX9-DL-NEXT: global_load_dword v0, v5, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s1, 0x3020001 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DL-NEXT: ; kill: killed $vgpr5 +; GFX9-DL-NEXT: ; kill: killed $vgpr4 +; GFX9-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 +; GFX9-DL-NEXT: v_perm_b32 v2, v3, v2, s0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_perm_b32 v0, v0, v0, s1 +; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, 0 +; GFX9-DL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-DL-NEXT: s_endpgm +; +; GFX10-DL-LABEL: idot4_acc32_v16i8: +; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: ; kill: killed $vgpr5 +; GFX10-DL-NEXT: ; kill: killed $vgpr4 +; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] +; GFX10-DL-NEXT: global_load_dword v0, v5, s[6:7] +; GFX10-DL-NEXT: s_waitcnt vmcnt(1) +; 
GFX10-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002 +; GFX10-DL-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc32_v16i8: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7] +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_perm_b32 v1, v3, v2, 0x7050002 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_nop 0 +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm + ptr addrspace(1) %src2, + ptr addrspace(1) nocapture %dst) { +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx + %vec1 = load <16 x i8>, ptr addrspace(1) %gep1 + %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx + %vec2 = load <8 x i8>, ptr addrspace(1) %gep2 + + %v1e0 = extractelement <16 x i8> %vec1, i64 8 + %cv1e0 = zext i8 %v1e0 to i32 + %v2e0 = extractelement <8 x i8> %vec2, i64 0 + %cv2e0 = zext i8 %v2e0 to i32 + %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 + + %v1e1 = extractelement <16 x i8> %vec1, i64 10 + %cv1e1 = zext i8 %v1e1 to i32 + %v2e1 = extractelement <8 x i8> %vec2, i64 1 + %cv2e1 = zext i8 %v2e1 to i32 + %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 + + %v1e2 = extractelement <16 x i8> %vec1, i64 13 + %cv1e2 = zext i8 
%v1e2 to i32 + %v2e2 = extractelement <8 x i8> %vec2, i64 2 + %cv2e2 = zext i8 %v2e2 to i32 + %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 + + %v1e3 = extractelement <16 x i8> %vec1, i64 15 + %cv1e3 = zext i8 %v1e3 to i32 + %v2e3 = extractelement <8 x i8> %vec2, i64 3 + %cv2e3 = zext i8 %v2e3 to i32 + %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 + + %add1 = add i32 %mul1, 0 + %add2 = add i32 %add1, %mul2 + %add3 = add i32 %add2, %mul3 + %add4 = add i32 %add3, %mul4 + store i32 %add4, ptr addrspace(1) %dst, align 4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2226,14 +2226,12 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_lshl_b32 s1, s4, 16 -; VI-NEXT: s_mov_b32 s2, 0xffff ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_mov_b32_e32 v6, s1 +; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v3, s2, v3, v3 ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -2308,14 +2306,13 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_mov_b32 s2, 0xffff ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v3, s2, v6, v3 -; VI-NEXT: 
v_bfi_b32 v1, s2, v1, v1 +; VI-NEXT: v_bfi_b32 v3, s0, v6, v3 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -2966,3 +2966,226 @@ store i32 %res, ptr addrspace(1) %out0, align 4 ret void } + +define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: extract_hilo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3060505 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_hilo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x3060505 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 + %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4 + %v1e5 = extractelement <8 x i8> %vec1, i64 5 + %zv1e5 = zext i8 %v1e5 to i32 + %byte1 = shl i32 %zv1e5, 8 + + %v1e6 = extractelement <8 x i8> %vec1, i64 6 + %zv1e6 = zext i8 %v1e6 to i32 + %byte2 = shl i32 %zv1e6, 16 + %v2e3 = extractelement <8 x i8> %vec2, i64 3 + %zv2e3 = zext i8 %v2e3 to i32 + %byte3 = shl i32 %zv2e3, 24 + + %tmp0 = or i32 %zv1e5, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, 
ptr addrspace(1) %out0) { +; GFX10-LABEL: extract_lohi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x70404 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_lohi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x70404 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 + %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4 + %v1e0 = extractelement <8 x i8> %vec1, i64 0 + %zv1e0 = zext i8 %v1e0 to i32 + %byte1 = shl i32 %zv1e0, 8 + + %v1e3 = extractelement <8 x i8> %vec1, i64 3 + %zv1e3 = zext i8 %v1e3 to i32 + %byte2 = shl i32 %zv1e3, 16 + %v2e4 = extractelement <8 x i8> %vec2, i64 4 + %zv2e4 = zext i8 %v2e4 to i32 + %byte3 = shl i32 %zv2e4, 24 + + %tmp0 = or i32 %zv1e0, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: extract_hihi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v7, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2070505 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_hihi: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v7, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x2070505 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 + %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4 + %v1e5 = extractelement <8 x i8> %vec1, i64 5 + %zv1e5 = zext i8 %v1e5 to i32 + %byte1 = shl i32 %zv1e5, 8 + + %v1e7 = extractelement <8 x i8> %vec1, i64 7 + %zv1e7 = zext i8 %v1e7 to i32 + %byte2 = shl i32 %zv1e7, 16 + %v2e6 = extractelement <8 x i8> %vec2, i64 6 + %zv2e6 = zext i8 %v2e6 to i32 + %byte3 = shl i32 %zv2e6, 24 + + %tmp0 = or i32 %zv1e5, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @extract_v8i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { +; GFX10-LABEL: extract_v8i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x1070404 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_v8i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x1070404 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 + %v1e4 = extractelement <8 x i8> %vec1, i64 4 + %zv1e4 = zext i8 %v1e4 to i32 + %byte1 = shl i32 %zv1e4, 8 + + %v1e7 = extractelement <8 x i8> %vec1, i64 7 + 
%zv1e7 = zext i8 %v1e7 to i32 + %byte2 = shl i32 %zv1e7, 16 + %v2e1 = extractelement <8 x i8> %vec1, i64 1 + %zv2e1 = zext i8 %v2e1 to i32 + %byte3 = shl i32 %zv2e1, 24 + + %tmp0 = or i32 %zv1e4, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +} + +; TODO : support this pattern +define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: extract_3src: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v8 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff000000, v1 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 8, v2 +; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract_3src: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v8 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff000000, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0 +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 + %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4 + %v1e0 = extractelement <8 x i8> %vec1, i64 0 + 
%zv1e0 = zext i8 %v1e0 to i32 + %byte1 = shl i32 %zv1e0, 8 + + %v1e5 = extractelement <8 x i8> %vec1, i64 5 + %zv1e5 = zext i8 %v1e5 to i32 + %byte2 = shl i32 %zv1e5, 16 + %v2e6 = extractelement <8 x i8> %vec2, i64 6 + %zv2e6 = zext i8 %v2e6 to i32 + %byte3 = shl i32 %zv2e6, 24 + + %tmp0 = or i32 %zv1e0, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +} + +