diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17,6 +17,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/ByteProvider.h"
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -37,6 +38,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
+#include <optional>
 
 using namespace llvm;
 
@@ -9721,7 +9723,7 @@
 // value 0-3 selects corresponding source byte;
 // value 0xc selects zero;
 // value 0xff selects 0xff.
-static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
+static uint32_t getPermuteMask(SDValue V) {
   assert(V.getValueSizeInBits() == 32);
 
   if (V.getNumOperands() != 2)
@@ -9737,15 +9739,13 @@
   default:
     break;
   case ISD::AND:
-    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+    if (uint32_t ConstMask = getConstantPermuteMask(C))
       return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
-    }
     break;
 
   case ISD::OR:
-    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+    if (uint32_t ConstMask = getConstantPermuteMask(C))
       return (0x03020100 & ~ConstMask) | ConstMask;
-    }
     break;
 
   case ISD::SHL:
@@ -9904,8 +9904,8 @@
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
-    uint32_t LHSMask = getPermuteMask(DAG, LHS);
-    uint32_t RHSMask = getPermuteMask(DAG, RHS);
+    uint32_t LHSMask = getPermuteMask(LHS);
+    uint32_t RHSMask = getPermuteMask(RHS);
     if (LHSMask != ~0u && RHSMask != ~0u) {
       // Canonicalize the expression in an attempt to have fewer unique masks
       // and therefore fewer registers used to hold the masks.
@@ -9951,6 +9951,322 @@ return SDValue(); } +// A key component of v_perm is a mapping between byte position of the src +// operands, and the byte position of the dest. To provide such, we need: 1. the +// node that provides x byte of the dest of the OR, and 2. the byte of the node +// used to provide that x byte. calculateByteProvider finds which node provides +// a certain byte of the dest of the OR, and calculateSrcByte takes that node, +// and finds an ultimate src and byte position For example: The supported +// LoadCombine pattern for vector loads is as follows +// t1 +// or +// / \ +// t2 t3 +// zext shl +// | | \ +// t4 t5 16 +// or anyext +// / \ | +// t6 t7 t8 +// srl shl or +// / | / \ / \ +// t9 t10 t11 t12 t13 t14 +// trunc* 8 trunc* 8 and and +// | | / | | \ +// t15 t16 t17 t18 t19 t20 +// trunc* 255 srl -256 +// | / \ +// t15 t15 16 +// +// *In this example, the truncs are from i32->i16 +// +// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3 +// respectively. calculateSrcByte would find (given node) -> ultimate src & +// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3. +// After finding the mapping, we can combine the tree into vperm t15, t16, +// 0x05000407 + +// Find the source and byte position from a node. +// \p DestByte is the byte position of the dest of the or that the src +// ultimately provides. \p SrcIndex is the byte of the src that maps to this +// dest of the or byte. \p Depth tracks how many recursive iterations we have +// performed. 
+static const std::optional<ByteProvider<SDValue>>
+calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
+                 unsigned Depth = 0) {
+  // We may need to recursively traverse a series of SRLs
+  if (Depth >= 6)
+    return std::nullopt;
+
+  switch (Op->getOpcode()) {
+  case ISD::TRUNCATE: {
+    if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
+      return std::nullopt;
+    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+  }
+
+  case ISD::SRL: {
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!ShiftOp)
+      return std::nullopt;
+
+    uint64_t BitShift = ShiftOp->getZExtValue();
+
+    if (BitShift % 8 != 0)
+      return std::nullopt;
+
+    SrcIndex += BitShift / 8;
+
+    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+  }
+
+  default: {
+    if (Op.getScalarValueSizeInBits() != 32)
+      return std::nullopt;
+
+    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+  }
+  }
+  llvm_unreachable("fully handled switch");
+}
+
+// For a byte position in the result of an Or, traverse the tree and find the
+// node (and the byte of the node) which ultimately provides this {Or,
+// BytePosition}. \p Op is the operand we are currently examining. \p Index is
+// the byte position of the Op that corresponds with the originally requested
+// byte of the Or \p Depth tracks how many recursive iterations we have
+// performed.
\p StartingIndex is the originally requested byte of the Or
+static const std::optional<ByteProvider<SDValue>>
+calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
+                      unsigned StartingIndex = 0) {
+  // Finding Src tree of RHS of or typically requires at least 1 additional
+  // depth
+  if (Depth > 6)
+    return std::nullopt;
+
+  unsigned BitWidth = Op.getScalarValueSizeInBits();
+  if (BitWidth % 8 != 0)
+    return std::nullopt;
+  assert(Index < BitWidth / 8 && "invalid index requested");
+
+  switch (Op.getOpcode()) {
+  case ISD::OR: {
+    auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
+                                     StartingIndex);
+    if (!RHS)
+      return std::nullopt;
+    auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
+                                     StartingIndex);
+    if (!LHS)
+      return std::nullopt;
+    // A well formed Or will have two ByteProviders for each byte, one of which
+    // is constant zero
+    if (!LHS->isConstantZero() && !RHS->isConstantZero())
+      return std::nullopt;
+    if (!LHS || LHS->isConstantZero())
+      return RHS;
+    if (!RHS || RHS->isConstantZero())
+      return LHS;
+    return std::nullopt;
+  }
+
+  case ISD::AND: {
+    auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!BitMaskOp)
+      return std::nullopt;
+
+    uint32_t BitMask = BitMaskOp->getZExtValue();
+    // Bits we expect for our StartingIndex
+    uint32_t IndexMask = 0xFF << (Index * 8);
+
+    if ((IndexMask & BitMask) != IndexMask) {
+      // If the result of the and partially provides the byte, then it
+      // is not well formatted
+      if (IndexMask & BitMask)
+        return std::nullopt;
+      return ByteProvider<SDValue>::getConstantZero();
+    }
+
+    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
+  }
+
+  case ISD::SRL: {
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!ShiftOp)
+      return std::nullopt;
+
+    uint64_t BitShift = ShiftOp->getZExtValue();
+    if (BitShift % 8)
+      return std::nullopt;
+
+    auto BitsProvided = Op.getScalarValueSizeInBits();
+    if (BitsProvided % 8 != 0)
+      return std::nullopt;
+
+    uint64_t BytesProvided = BitsProvided / 8;
+
uint64_t ByteShift = BitShift / 8;
+    // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
+    // If the byte we are trying to provide (as tracked by index) falls in this
+    // range, then the SRL provides the byte. The byte of interest of the src of
+    // the SRL is Index + ByteShift
+    return BytesProvided - ByteShift > Index
+               ? calculateSrcByte(Op->getOperand(0), StartingIndex,
+                                  Index + ByteShift)
+               : ByteProvider<SDValue>::getConstantZero();
+  }
+
+  case ISD::SHL: {
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!ShiftOp)
+      return std::nullopt;
+
+    uint64_t BitShift = ShiftOp->getZExtValue();
+    if (BitShift % 8 != 0)
+      return std::nullopt;
+    uint64_t ByteShift = BitShift / 8;
+
+    // If we are shifting by an amount greater than (or equal to)
+    // the index we are trying to provide, then it provides 0s. If not,
+    // then this bytes are not definitively 0s, and the corresponding byte
+    // of interest is Index - ByteShift of the src
+    return Index < ByteShift
+               ? ByteProvider<SDValue>::getConstantZero()
+               : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
+                                       Depth + 1, StartingIndex);
+  }
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND: {
+    SDValue NarrowOp = Op->getOperand(0);
+    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
+      return std::nullopt;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    if (Index >= NarrowByteWidth)
+      return Op.getOpcode() == ISD::ZERO_EXTEND
+                 ?
std::optional<ByteProvider<SDValue>>(
+                       ByteProvider<SDValue>::getConstantZero())
+                 : std::nullopt;
+    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
+  }
+
+  case ISD::TRUNCATE: {
+    unsigned NarrowBitWidth = Op.getScalarValueSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
+      return std::nullopt;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    if (NarrowByteWidth >= Index) {
+      return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
+                                   StartingIndex);
+    }
+
+    return std::nullopt;
+  }
+
+  case ISD::LOAD: {
+    auto L = cast<LoadSDNode>(Op.getNode());
+    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
+      return std::nullopt;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    // If the width of the load does not reach byte we are trying to provide for
+    // and it is not a ZEXTLOAD, then the load does not provide for the byte in
+    // question
+    if (Index >= NarrowByteWidth) {
+      return L->getExtensionType() == ISD::ZEXTLOAD
+                 ? std::optional<ByteProvider<SDValue>>(
+                       ByteProvider<SDValue>::getConstantZero())
+                 : std::nullopt;
+    }
+
+    if (NarrowByteWidth > Index) {
+      return calculateSrcByte(Op, StartingIndex, Index);
+    }
+
+    return std::nullopt;
+  }
+
+  default: {
+    return std::nullopt;
+  }
+  }
+
+  llvm_unreachable("fully handled switch");
+}
+
+// Returns true if the Operand is a scalar and is 16 bits
+static bool is16BitScalarOp(SDValue &Operand) {
+  switch (Operand.getOpcode()) {
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND: {
+    auto OpVT = Operand.getOperand(0).getValueType();
+    return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
+  }
+  case ISD::LOAD: {
+    LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
+    auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
+    if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
+        ExtType == ISD::EXTLOAD) {
+      auto MemVT = L->getMemoryVT();
+      return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
+    }
+    return false;
+  }
+  default:
+    return false;
+  }
+}
+
+// Returns true if the mask matches consecutive bytes, and
the first byte
+// begins at a power of 2 byte offset from 0th byte
+static bool addresses16Bits(int Mask) {
+  int Low8 = Mask & 0xff;
+  int Hi8 = (Mask & 0xff00) >> 8;
+
+  assert(Low8 < 8 && Hi8 < 8);
+  // Are the bytes contiguous in the order of increasing addresses.
+  bool IsConsecutive = (Hi8 - Low8 == 1);
+  // Is the first byte at location that is aligned for 16 bit instructions.
+  // A counter example is taking 2 consecutive bytes starting at the 8th bit.
+  // In this case, we still need code to extract the 16 bit operand, so it
+  // is better to use i8 v_perm
+  bool Is16Aligned = !(Low8 % 2);
+
+  return IsConsecutive && Is16Aligned;
+}
+
+// Do not lower into v_perm if the operands are actually 16 bit
+// and the selected bits (based on PermMask) correspond with two
+// easily addressable 16 bit operands.
+static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
+                                SDValue &OtherOp) {
+  int Low16 = PermMask & 0xffff;
+  int Hi16 = (PermMask & 0xffff0000) >> 16;
+
+  // ByteProvider only accepts 32 bit operands
+  assert(Op.getValueType().getSizeInBits() == 32);
+  assert(OtherOp.getValueType().getSizeInBits() == 32);
+
+  auto OpIs16Bit = is16BitScalarOp(Op);
+  auto OtherOpIs16Bit = is16BitScalarOp(OtherOp);
+
+  // If there is a size mismatch, then we must use masking on at least one
+  // operand
+  if (OpIs16Bit != OtherOpIs16Bit)
+    return true;
+
+  // If both operands are 16 bit, return whether or not we cleanly address both
+  if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
+    return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
+
+  // Both are 32 bit operands
+  return true;
+}
+
 SDValue SITargetLowering::performOrCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -10001,8 +10317,36 @@
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
-    uint32_t LHSMask = getPermuteMask(DAG, LHS);
-
uint32_t RHSMask = getPermuteMask(DAG, RHS);
+
+    // If all the uses of an or need to extract the individual elements, do not
+    // attempt to lower into v_perm
+    auto usesCombinedOperand = [](SDNode *OrUse) {
+      // If we have any non-vectorized use, then it is a candidate for v_perm
+      if (OrUse->getOpcode() != ISD::BITCAST ||
+          !OrUse->getValueType(0).isVector())
+        return true;
+
+      // If we have any non-vectorized use, then it is a candidate for v_perm
+      for (auto VUse : OrUse->uses()) {
+        if (!VUse->getValueType(0).isVector())
+          return true;
+
+        // If the use of a vector is a store, then combining via a v_perm
+        // is beneficial.
+        // TODO -- whitelist more uses
+        for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
+          if (VUse->getOpcode() == VectorwiseOp)
+            return true;
+      }
+      return false;
+    };
+
+    if (!any_of(N->uses(), usesCombinedOperand))
+      return SDValue();
+
+    uint32_t LHSMask = getPermuteMask(LHS);
+    uint32_t RHSMask = getPermuteMask(RHS);
+
     if (LHSMask != ~0u && RHSMask != ~0u) {
       // Canonicalize the expression in an attempt to have fewer unique masks
       // and therefore fewer registers used to hold the masks.
@@ -10035,6 +10379,71 @@
           DAG.getConstant(Sel, DL, MVT::i32));
     }
   }
+  if (LHSMask == ~0u || RHSMask == ~0u) {
+    SmallVector<ByteProvider<SDValue>, 8> PermNodes;
+
+    // VT is known to be MVT::i32, so we need to provide 4 bytes.
+    assert(VT == MVT::i32);
+    for (int i = 0; i < 4; i++) {
+      // Find the ByteProvider that provides the ith byte of the result of OR
+      std::optional<ByteProvider<SDValue>> P =
+          calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
+      // TODO support constantZero
+      if (!P || P->isConstantZero())
+        return SDValue();
+
+      PermNodes.push_back(*P);
+    }
+    if (PermNodes.size() != 4)
+      return SDValue();
+
+    int FirstSrc = 0;
+    std::optional<int> SecondSrc;
+    uint64_t permMask = 0x00000000;
+    for (size_t i = 0; i < PermNodes.size(); i++) {
+      auto PermOp = PermNodes[i];
+      // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
+      // by sizeof(Src2) = 4
+      int SrcByteAdjust = 4;
+
+      if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
+        if (SecondSrc.has_value())
+          if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
+            return SDValue();
+        // Set the index of the second distinct Src node
+        SecondSrc = i;
+        assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
+               32);
+        SrcByteAdjust = 0;
+      }
+      assert(PermOp.SrcOffset + SrcByteAdjust < 8);
+      assert(!DAG.getDataLayout().isBigEndian());
+      permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
+    }
+
+    SDValue Op = *PermNodes[FirstSrc].Src;
+    SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
+                                            : *PermNodes[FirstSrc].Src;
+
+    // Check that we are not just extracting the bytes in order from an op
+    if (Op == OtherOp) {
+      int Low16 = permMask & 0xffff;
+      int Hi16 = (permMask & 0xffff0000) >> 16;
+
+      bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
+      bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
+
+      // The perm op would really just produce Op.
So combine into Op + if (WellFormedLow && WellFormedHi) + return Op; + } + + if (hasEightBitAccesses(permMask, Op, OtherOp)) { + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, + DAG.getConstant(permMask, DL, MVT::i32)); + } + } } if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -42,16 +42,11 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_mov_b32 s0, 0x6050400 +; GCN-NEXT: s_mov_b32 s0, 0x7050604 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_u32 v3, v2, 16, 8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v2 -; GCN-NEXT: v_perm_b32 v3, v3, v2, s0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff0000, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff000000, v2 -; GCN-NEXT: v_or3_b32 v2, v3, v4, v2 +; GCN-NEXT: v_perm_b32 v2, v2, v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm entry: @@ -238,9 +233,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v2, v[0:1], off ; GCN-NEXT: global_load_dword v3, v[0:1], off offset:4 -; GCN-NEXT: s_mov_b32 s4, 0xffff0000 +; GCN-NEXT: s_mov_b32 s4, 0x3020504 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_or_b32 v0, v3, s4, v2 +; GCN-NEXT: v_perm_b32 v0, v2, v3, s4 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(1) %p, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1426,6 +1426,178 @@ ret void } +; The other use of shuffle0_0 make it profitable to lower into v_perm + +define 
amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out1, <4 x i8> addrspace(1)* noalias %in, <4 x i8> addrspace(1)* noalias %in1) nounwind { +; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2 +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v4, v6 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 +; SI-NEXT: v_alignbit_b32 v5, v3, v5, 24 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s8, 0x4000405 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, 
s6, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ubyte v6, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v3, v[4:5] +; VI-NEXT: flat_load_ubyte v4, v[0:1] +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_perm_b32 v4, v4, v5, s8 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 +; GFX10-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; 
GFX10-NEXT: v_lshl_or_b32 v6, v2, 8, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405 +; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dword v7, v4, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 +; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-NEXT: s_mov_b32 s4, 0x4000405 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v7, v2, 8, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_perm_b32 v4, v6, v7, s4 +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dword v5, v4, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 +; GFX11-NEXT: global_load_u8 v3, v0, s[4:5] offset:3 +; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:3 +; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v4, v3, 8, v1 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v5, v2, 8, v0 +; 
GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v4, v4, v5, 0x4000405
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT:    global_store_b32 v6, v4, s[2:3]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
+  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in1, i32 %tid
+  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
+  %load1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1, align 1
+  %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2>
+  %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
+  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
+  store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out1, align 4
+  ret void
+}
+
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; Instructions still emitted to repack bytes for add use.
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -241,20 +241,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align1: ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off -; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: private_load_2xi16_align1: @@ -263,8 +257,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align1: @@ -273,8 +265,6 @@ ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_load_2xi16_align1: @@ -283,9 +273,6 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
; GFX11-NEXT: scratch_load_b32 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align1: @@ -294,9 +281,6 @@ ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1 %p.0 = load i16, ptr addrspace(5) %p, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1280,10 +1280,9 @@ ; GFX9-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_perm_b32 v2, v2, v3, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1291,13 +1290,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_perm_b32 v2, v2, v3, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -755,10 +755,10 @@ ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x3020706 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-NEXT: v_alignbit_b32 v2, v2, s4, 16 +; VI-NEXT: v_perm_b32 v2, s4, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1594,6 +1594,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1601,11 +1602,9 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v0, s0, v4, v0 +; VI-NEXT: v_perm_b32 v0, s4, v0, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1751,6 +1750,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1758,11 +1758,9 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 +; VI-NEXT: v_perm_b32 v1, s4, v1, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1908,6 +1906,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1915,11 +1914,9 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 +; VI-NEXT: v_perm_b32 v1, s4, v1, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -2689,6 +2686,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 @@ -2701,11 +2699,9 @@ ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_mov_b32 s2, 0xffff -; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_bfi_b32 v3, s2, v12, v3 +; VI-NEXT: v_perm_b32 v3, s4, v3, v12 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] diff 
--git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -221,9 +221,9 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -682,12 +682,12 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: ds_write_b16 v3, v2 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -794,9 +794,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -836,9 +836,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) 
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1050,9 +1050,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1089,9 +1089,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1292,9 +1292,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1394,9 +1394,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: 
v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1444,8 +1444,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1493,8 +1493,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1542,8 +1542,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1843,9 +1843,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: 
s_setpc_b64 s[30:31] @@ -1885,9 +1885,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -2026,8 +2026,8 @@ ; GFX803-NEXT: v_mov_b32_e32 v2, 44 ; GFX803-NEXT: buffer_load_ushort v1, v2, s[0:3], s32 offen offset:4054 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -191,8 +191,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_mov_b32 s0, 0x1000504 +; GFX8-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 ; GFX8-NEXT: ;;#ASMEND @@ -271,10 +271,10 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_mov_b32 s0, 0x1000504 ; GFX8-NEXT: s_mov_b32 s3, 0x1100f000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_perm_b32 v0, v0, v1, s0 ; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 9, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -187,8 +187,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: flat_load_dword v1, v[2:3] glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_mov_b32 s0, 0x1000504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX803-NEXT: ;;#ASMSTART ; GFX803-NEXT: ; use v0 ; GFX803-NEXT: ;;#ASMEND @@ -265,10 +265,10 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: flat_load_dword v1, v[2:3] glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: s_mov_b32 s0, 0x1000504 ; GFX803-NEXT: s_mov_b32 s3, 0x1100f000 ; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 9, v0 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX803-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -0,0 +1,2811 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 + +define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle6766: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6060706 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle6766: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x6060706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle3744: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x307 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle3744: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_movk_i32 s4, 0x307 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + 
%vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle4445(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle4445: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040404 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle4445: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040404 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle0101(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle0101: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle0101: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, 
v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040504 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle1004: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x40405 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle1004: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x40405 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + + + +define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) { +; GFX10-LABEL: shuffle7533: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX10-NEXT: flat_load_dword v6, v[0:1] +; GFX10-NEXT: flat_load_dword v7, v[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3030507 +; GFX10-NEXT: flat_store_dword v[4:5], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle7533: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dword v6, v[0:1] +; GFX9-NEXT: flat_load_dword v7, v[2:3] +; GFX9-NEXT: s_mov_b32 s4, 0x3030507 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 +; GFX9-NEXT: flat_store_dword v[4:5], v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4 + ret void +} + +define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) { +; GFX10-LABEL: shuffle7767: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_load_dword v0, v[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060707 +; GFX10-NEXT: flat_store_dword v[4:5], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle7767: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dword v0, v[2:3] +; GFX9-NEXT: s_mov_b32 s4, 0x7060707 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: flat_store_dword v[4:5], v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(0) %in0, 
align 4 + %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4 + ret void +} + +define hidden void @shuffle0554(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) { +; GFX10-LABEL: shuffle0554: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: ds_read_b32 v1, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x10104 +; GFX10-NEXT: ds_write_b32 v2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle0554: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: ds_read_b32 v1, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x10104 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: ds_write_b32 v2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4 + ret void +} + +define hidden void @shuffle2127(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) { +; GFX10-LABEL: shuffle2127: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: ds_read_b32 v1, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3060506 +; GFX10-NEXT: ds_write_b32 v2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle2127: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
ds_read_b32 v0, v0 +; GFX9-NEXT: ds_read_b32 v1, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x3060506 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: ds_write_b32 v2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4 + ret void +} + +define hidden void @shuffle5047(ptr addrspace(5) %in0, ptr addrspace(5) %in1, ptr addrspace(5) %out0) { +; GFX10-LABEL: shuffle5047: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v4, v3, 0x7040005 +; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle5047: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; GFX9-NEXT: s_mov_b32 s4, 0x7040005 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v4, v3, s4 +; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(5) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(5) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(5) %out0, align 4 + ret void +} + +define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle3546: +; GFX10: ; 
%bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2000107 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle3546: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x2000107 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + + +define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle7330ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x4070706 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle7330ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x4070706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, 
ptr addrspace(1) %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle5341ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040706 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle5341ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle6106ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle6106ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle6106ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040504 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 
v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + + +define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle4327ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle4327ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle3263ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle3263ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060607 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle3263ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060607 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle2763ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle2763ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle1327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle1327ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060705 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle1327ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060705 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shuffle0605ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { +; GFX10-LABEL: shuffle0605ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle0605ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040504 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { +; GFX10-LABEL: insertUsesOr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 
+; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: insertUsesOr: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + ret void +} + +define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { +; GFX10-LABEL: addUsesOr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7 +; GFX10-NEXT: v_add_nc_u16 v2, v2, v3 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_add_nc_u16 v1, v4, v1 +; 
GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: addUsesOr: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + %added = add <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %added, ptr addrspace(1) %out0 + ret void +} + + +define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out1) #0 { +; GFX10-LABEL: shuffle8i8: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 +; 
GFX10-NEXT: s_lshr_b32 s4, s9, 16 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, s9 +; GFX10-NEXT: v_and_b32_e64 v1, 0xffffff00, s8 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, s4 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, s8 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: v_or_b32_sdwa v0, s1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: shuffle8i8: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffffff00 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s9 +; GFX9-NEXT: v_or_b32_sdwa v4, s1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s1, s9, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 8, s8 +; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s1 +; GFX9-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, s0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_endpgm +bb: + %vec0 = load <8 x i8>, ptr addrspace(1) %in0 + %vec1 = load <8 x i8>, ptr addrspace(1) %in1 + %shuffle0 = shufflevector <8 x i8> %vec0, <8 x i8> %vec1, <8 x i32> + store <8 x i8> %shuffle0, ptr addrspace(1) %out1 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone + +; Not combined to perm due to non-vectorized use, non-divergent +define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { +; GFX10-LABEL: add: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v7 +; GFX10-NEXT: v_lshrrev_b16 v3, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: v_add_nc_u16 v2, v7, v2 +; GFX10-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-NEXT: v_add_nc_u16 v1, v1, v4 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v0, 
off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v1, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v2, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u16_sdwa v3, v7, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = add <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + ret void +} + +; Not combined to perm due to non-vectorized use +define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { +; GFX10-LABEL: add_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_add_nc_u16 v1, v1, v7 +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, 
i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + %vecins = add <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + ret void +} + +; Not combined to perm due to non-divergent use +define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: add_store: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 +; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4 +; GFX10-NEXT: v_add_nc_u16 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u16 v3, v2, v9 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_store: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: 
v_add_u16_e32 v3, v0, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + %vecins = add <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + +; Not combined to perm due to 16 bit or +define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: add_store_div_16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 +; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4 +; GFX10-NEXT: v_add_nc_u16 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u16 v3, v2, v9 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_store_div_16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v2, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + %vecins = add <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + 
ret void +} + +; Vectorized use, divergent, 32 bit or +define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: add_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v9 +; GFX10-NEXT: v_lshrrev_b16 v3, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: v_add_nc_u16 v2, v9, v2 +; GFX10-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-NEXT: v_add_nc_u16 v1, v1, v10 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x10705 +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: add_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x10705 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 +; GFX9-NEXT: v_add_u16_sdwa v2, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v3, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4> + %vecins = add <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + +define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, 
i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: and_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: v_mov_b32_e32 v0, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x102 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v2, 0x100, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x5070006 +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: and_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, 
vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 2 +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: s_movk_i32 s5, 0x102 +; GFX9-NEXT: s_mov_b32 s4, 0x5070006 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v2, 0x100, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v9, s4 +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 4, i32 3, i32 1> + %vecins = and <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + +define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: ashr_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 26 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 24, v9 +; GFX10-NEXT: v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 7, v1 +; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ashrrev_i16 v4, 10, v0 +; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1 +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: ashr_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 26 +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, 7 +; GFX9-NEXT: s_mov_b32 s4, 0x4010707 +; 
GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v3, v4, v9, s4 +; GFX9-NEXT: v_ashrrev_i16_e32 v9, 10, v9 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = ashr <4 x i8> %shuffle0_0, + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + +define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: bc_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 
v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v9, v4, 0x7060104 +; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bc_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060104 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 +; GFX9-NEXT: global_store_dword v[7:8], v0, off +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %insvec = bitcast <4 x i8> %shuffle0_0 to i32 + store i32 %insvec, ptr addrspace(1) %out1 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 + ret void +} + + +define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { +; GFX10-LABEL: eve_store_div: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v1, v5, v4, 0x1020305 +; GFX10-NEXT: global_store_byte v[9:10], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: eve_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x1020305 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 +; GFX9-NEXT: global_store_byte v[9:10], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> 
%vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 3, i32 2, i32 1> + %tmp = extractelement <4 x i8> %shuffle0_0, i32 1 + store i8 %tmp, ptr addrspace(1) %out2 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + +; Not combined to perm due to multi use of or operands (introduced by insert op) +define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: ive_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v9, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 2, v9 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: global_load_dword v10, v[2:3], off +; GFX10-NEXT: v_mov_b32_e32 v0, 16 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xff +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v10, 16 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: ive_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: v_and_b32_e32 v9, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 2, v9 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: global_load_dword v10, v[2:3], off +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_alignbit_b32 v2, v1, v10, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: lhsr_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: v_mov_b32_e32 v0, 26 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b16 v1, 1, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 25, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 26, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7f00, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x1030707 +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lhsr_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 26 +; GFX9-NEXT: s_mov_b32 s4, 0x1030707 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b16_e32 v3, 1, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 25, v9 +; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 26, v4 +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7f00, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = lshr <4 x i8> %shuffle0_0, + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: mul_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b16 v0, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) 
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v9 +; GFX10-NEXT: v_lshrrev_b16 v2, 8, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GFX10-NEXT: v_mul_lo_u16 v0, v0, v2 +; GFX10-NEXT: v_mul_lo_u16 v1, v3, v1 +; GFX10-NEXT: v_mul_lo_u16 v2, v4, v9 +; GFX10-NEXT: v_mul_lo_u16 v3, v9, v3 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2000504 +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: mul_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x2000504 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_lo_u16_e32 v2, v9, v4 +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_mul_lo_u16_e32 v0, v4, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = mul <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: or_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 16 +; GFX10-NEXT: v_mov_b32_e32 v2, 0x102 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; 
GFX10-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v1, 0x201, v1 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x2010005 +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: or_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x2010005 +; GFX9-NEXT: s_movk_i32 s5, 0x102 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v4, v0, v2, s4 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, 0x201, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 
s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = or <4 x i8> %shuffle0_0, + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + +define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: sdiv_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_bfe_i32 v0, v4, 0, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 24, v9 +; GFX10-NEXT: v_bfe_i32 v3, v4, 8, 8 +; GFX10-NEXT: v_bfe_i32 v1, v9, 16, 8 +; GFX10-NEXT: v_bfe_i32 v10, v4, 16, 8 +; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v11, 24, v4 +; GFX10-NEXT: v_xor_b32_e32 v15, v2, v3 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX10-NEXT: v_xor_b32_e32 v12, v1, v0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v13 +; GFX10-NEXT: v_cvt_f32_i32_e32 v14, v1 +; GFX10-NEXT: v_xor_b32_e32 v1, v1, v10 +; GFX10-NEXT: v_cvt_f32_i32_e32 v10, v10 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v3 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v11 +; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v11 +; 
GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v10 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12 +; GFX10-NEXT: v_mul_f32_e32 v16, v14, v16 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v11 +; GFX10-NEXT: v_ashrrev_i32_e32 v15, 30, v15 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 +; GFX10-NEXT: v_trunc_f32_e32 v16, v16 +; GFX10-NEXT: v_or_b32_e32 v12, 1, v12 +; GFX10-NEXT: v_or_b32_e32 v15, 1, v15 +; GFX10-NEXT: v_mul_f32_e32 v18, v14, v18 +; GFX10-NEXT: v_trunc_f32_e32 v17, v17 +; GFX10-NEXT: v_mad_f32 v20, -v16, v13, v14 +; GFX10-NEXT: v_mul_f32_e32 v19, v13, v19 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX10-NEXT: v_trunc_f32_e32 v18, v18 +; GFX10-NEXT: v_mad_f32 v2, -v17, v3, v2 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v13| +; GFX10-NEXT: v_trunc_f32_e32 v19, v19 +; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_mad_f32 v14, -v18, v10, v14 +; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v3| +; GFX10-NEXT: v_mad_f32 v21, -v19, v11, v13 +; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 +; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 +; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v15, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, |v10| +; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v16, v12 +; GFX10-NEXT: v_add_nc_u32_sdwa v2, v17, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v11| +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u32_e32 v1, v18, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_add_nc_u32_sdwa v0, v19, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: 
v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706 +; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: sdiv_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x60706 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_bfe_i32 v1, v4, 0, 8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 +; GFX9-NEXT: v_bfe_i32 v2, v9, 16, 8 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 24, v9 +; GFX9-NEXT: v_bfe_i32 v9, v4, 8, 8 +; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v1 +; GFX9-NEXT: v_bfe_i32 v10, v4, 16, 8 +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 24, v4 +; GFX9-NEXT: v_xor_b32_e32 v14, v3, v9 +; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v9 +; GFX9-NEXT: v_xor_b32_e32 v11, v2, v1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v13, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, v2, v10 +; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v10 +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v12 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v9 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v10 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4 +; GFX9-NEXT: v_mul_f32_e32 v15, v13, v15 +; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16 +; GFX9-NEXT: v_trunc_f32_e32 v15, v15 +; GFX9-NEXT: 
v_ashrrev_i32_e32 v11, 30, v11 +; GFX9-NEXT: v_mul_f32_e32 v17, v13, v17 +; GFX9-NEXT: v_mul_f32_e32 v18, v12, v18 +; GFX9-NEXT: v_trunc_f32_e32 v16, v16 +; GFX9-NEXT: v_mad_f32 v19, -v15, v12, v13 +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 +; GFX9-NEXT: v_or_b32_e32 v11, 1, v11 +; GFX9-NEXT: v_trunc_f32_e32 v17, v17 +; GFX9-NEXT: v_trunc_f32_e32 v18, v18 +; GFX9-NEXT: v_mad_f32 v3, -v16, v9, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v12| +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15 +; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16 +; GFX9-NEXT: v_mad_f32 v13, -v17, v10, v13 +; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 +; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v12 +; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v9| +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v10| +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4| +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v4, v15, v11 +; GFX9-NEXT: v_add_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v2, v17, v2 +; GFX9-NEXT: v_add_u32_sdwa v1, v18, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = sdiv <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: sext_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; GFX10-NEXT: v_ashrrev_i16 v2, 8, v4 +; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0 +; GFX10-NEXT: v_ashrrev_i16 v3, 8, v1 +; GFX10-NEXT: v_perm_b32 v1, v0, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v3, v3, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v9, v4, 0x3010707 +; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off +; GFX10-NEXT: global_store_dword v[5:6], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: sext_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: s_mov_b32 s5, 0x5040100 +; GFX9-NEXT: s_mov_b32 s4, 0x3010707 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_ashrrev_i16_e32 v1, 8, v9 +; GFX9-NEXT: v_ashrrev_i16_sdwa v3, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s5 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s5 +; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4 +; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off +; GFX9-NEXT: global_store_dword v[5:6], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %insvec = sext <4 x i8> %shuffle0_0 to <4 x i16> + store <4 x i16> %insvec, ptr addrspace(1) %out1 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 + ret void +} + + +define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: shl_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; 
GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b16 v0, 2, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b16 v1, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v2, 0xfffffc00, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xfe, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xfffffe00, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xfc, v0 +; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5000104 +; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shl_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5000104 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 2, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 1, v9 +; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xfffffc00, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xfe, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xfffffe00, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xfc, v1 +; GFX9-NEXT: v_or_b32_e32 v3, v4, v3 +; 
GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = shl <4 x i8> %shuffle0_0, + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: sitofp_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; GFX10-NEXT: v_ashrrev_i16 v2, 8, v9 +; GFX10-NEXT: v_ashrrev_i16 v3, 8, v4 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x6010205 +; GFX10-NEXT: v_bfe_i32 v10, v0, 0, 8 +; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v2) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: sitofp_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x6010205 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: v_ashrrev_i16_e32 v1, 8, v9 +; GFX9-NEXT: v_bfe_i32 v10, v0, 0, 8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_ashrrev_i16_e32 v3, 8, v4 +; GFX9-NEXT: v_bfe_i32 v11, v2, 0, 8 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_perm_b32 v4, v4, v9, s4 +; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: global_store_dword v[5:6], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 
@llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %insvec = sitofp <4 x i8> %shuffle0_0 to <4 x float> + store <4 x float> %insvec, ptr addrspace(1) %out1 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 + ret void +} + + +define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: srem_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_bfe_i32 v1, v4, 0, 8 +; GFX10-NEXT: v_bfe_i32 v2, v4, 16, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v10, 24, v9 +; GFX10-NEXT: v_bfe_i32 v11, v4, 8, 8 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 24, v4 +; GFX10-NEXT: v_bfe_i32 v13, v9, 16, 8 +; GFX10-NEXT: v_xor_b32_e32 v14, v2, v1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX10-NEXT: v_xor_b32_e32 v16, v10, v11 +; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v11 +; GFX10-NEXT: v_cvt_f32_i32_e32 v15, v2 +; GFX10-NEXT: v_cvt_f32_i32_e32 v10, v10 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v17, v12 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v11 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15 +; GFX10-NEXT: v_xor_b32_e32 v2, v12, v2 +; 
GFX10-NEXT: v_xor_b32_e32 v12, v13, v12 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v21, v17 +; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14 +; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v13 +; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16 +; GFX10-NEXT: v_mul_f32_e32 v18, v15, v18 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GFX10-NEXT: v_mul_f32_e32 v19, v10, v19 +; GFX10-NEXT: v_mul_f32_e32 v20, v17, v20 +; GFX10-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX10-NEXT: v_trunc_f32_e32 v18, v18 +; GFX10-NEXT: v_mul_f32_e32 v21, v13, v21 +; GFX10-NEXT: v_trunc_f32_e32 v19, v19 +; GFX10-NEXT: v_trunc_f32_e32 v20, v20 +; GFX10-NEXT: v_or_b32_e32 v16, 1, v16 +; GFX10-NEXT: v_mad_f32 v22, -v18, v1, v15 +; GFX10-NEXT: v_trunc_f32_e32 v21, v21 +; GFX10-NEXT: v_mad_f32 v10, -v19, v11, v10 +; GFX10-NEXT: v_mad_f32 v23, -v20, v15, v17 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v1| +; GFX10-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_mad_f32 v13, -v21, v17, v13 +; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 +; GFX10-NEXT: v_or_b32_e32 v12, 1, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v14, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v10|, |v11| +; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 +; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20 +; GFX10-NEXT: v_cvt_i32_f32_e32 v21, v21 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v16, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v15| +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v18, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v10, v19, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v13|, |v17| +; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4 +; GFX10-NEXT: v_mul_lo_u32 v3, v10, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v20, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v4 +; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v11, v21, v11 +; 
GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_mul_lo_u32 v10, v11, v12 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v2 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2070306 +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: srem_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x2070306 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 8 +; GFX9-NEXT: v_bfe_i32 v3, v4, 16, 8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 24, v9 +; GFX9-NEXT: v_bfe_i32 v12, v4, 8, 8 +; GFX9-NEXT: v_xor_b32_e32 v16, v3, v2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v13, 24, v4 +; GFX9-NEXT: v_xor_b32_e32 v18, v11, v12 +; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v12 +; GFX9-NEXT: v_cvt_f32_i32_e32 v17, v3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v19, v13 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v20, v2 +; GFX9-NEXT: v_bfe_i32 
v15, v9, 16, 8 +; GFX9-NEXT: v_cvt_f32_i32_e32 v11, v11 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v21, v12 +; GFX9-NEXT: v_xor_b32_e32 v3, v13, v3 +; GFX9-NEXT: v_xor_b32_e32 v13, v15, v13 +; GFX9-NEXT: v_cvt_f32_i32_e32 v15, v15 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v22, v17 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v23, v19 +; GFX9-NEXT: v_mul_f32_e32 v20, v17, v20 +; GFX9-NEXT: v_mul_f32_e32 v21, v11, v21 +; GFX9-NEXT: v_trunc_f32_e32 v20, v20 +; GFX9-NEXT: v_ashrrev_i32_e32 v16, 30, v16 +; GFX9-NEXT: v_mul_f32_e32 v22, v19, v22 +; GFX9-NEXT: v_mul_f32_e32 v23, v15, v23 +; GFX9-NEXT: v_trunc_f32_e32 v21, v21 +; GFX9-NEXT: v_mad_f32 v24, -v20, v2, v17 +; GFX9-NEXT: v_ashrrev_i32_e32 v18, 30, v18 +; GFX9-NEXT: v_or_b32_e32 v16, 1, v16 +; GFX9-NEXT: v_trunc_f32_e32 v22, v22 +; GFX9-NEXT: v_trunc_f32_e32 v23, v23 +; GFX9-NEXT: v_mad_f32 v11, -v21, v12, v11 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v24|, |v2| +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 30, v3 +; GFX9-NEXT: v_or_b32_e32 v18, 1, v18 +; GFX9-NEXT: v_cvt_i32_f32_e32 v20, v20 +; GFX9-NEXT: v_cvt_i32_f32_e32 v21, v21 +; GFX9-NEXT: v_mad_f32 v25, -v22, v17, v19 +; GFX9-NEXT: v_cvt_i32_f32_e32 v22, v22 +; GFX9-NEXT: v_mad_f32 v15, -v23, v19, v15 +; GFX9-NEXT: v_cvt_i32_f32_e32 v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| +; GFX9-NEXT: v_ashrrev_i32_e32 v13, 30, v13 +; GFX9-NEXT: v_or_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v18, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v25|, |v17| +; GFX9-NEXT: v_or_b32_e32 v13, 1, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, |v19| +; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v13, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v2, v20, v2 +; GFX9-NEXT: v_add_u32_e32 v11, v21, v11 +; GFX9-NEXT: v_add_u32_e32 v3, v22, v3 +; GFX9-NEXT: v_add_u32_e32 v12, v23, v12 +; GFX9-NEXT: 
v_perm_b32 v1, v4, v9, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v11, v10 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0 +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v14 +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_e32 v3, v14, v3 +; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = srem <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: sub_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: 
v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_lshrrev_b16 v3, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_nc_u16 v3, v0, v3 +; GFX10-NEXT: v_sub_nc_u16 v9, v1, v4 +; GFX10-NEXT: v_sub_nc_u16 v10, v4, v2 +; GFX10-NEXT: v_sub_nc_u16 v1, v4, v1 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x6070007 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-NEXT: v_lshlrev_b16 v4, 8, v9 +; GFX10-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: sub_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x6070007 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX9-NEXT: v_sub_u16_sdwa v9, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v4, v2, v0, s4 +; 
GFX9-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_sub_u16_e32 v2, v3, v2 +; GFX9-NEXT: v_sub_u16_e32 v1, v3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = sub <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { +; GFX10-LABEL: sv_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x50705 +; 
GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: sv_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x50705 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: global_store_dword v[7:8], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %insvec = shufflevector <4 x i8> %shuffle0_0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %insvec, ptr addrspace(1) %out1 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: trunc_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: 
global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: v_mov_b32_e32 v0, 1 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX10-NEXT: v_lshlrev_b16 v2, 2, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_lshlrev_b16 v1, 3, v4 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x50205 +; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX10-NEXT: global_store_byte v[7:8], v0, off +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: trunc_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: s_mov_b32 s4, 0x50205 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 3, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v2, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 2, v2 +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX9-NEXT: 
v_or_b32_e32 v0, v0, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX9-NEXT: global_store_byte v[7:8], v0, off +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %insvec = trunc <4 x i8> %shuffle0_0 to <4 x i1> + store <4 x i1> %insvec, ptr addrspace(1) %out1 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 + ret void +} + +define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: udiv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v14, v0 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v15, v0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x40207 +; GFX10-NEXT: 
v_mul_f32_e32 v10, v14, v10 +; GFX10-NEXT: v_mul_f32_e32 v11, v4, v11 +; GFX10-NEXT: v_mul_f32_e32 v13, v1, v13 +; GFX10-NEXT: v_mul_f32_e32 v12, v15, v12 +; GFX10-NEXT: v_trunc_f32_e32 v10, v10 +; GFX10-NEXT: v_trunc_f32_e32 v11, v11 +; GFX10-NEXT: v_trunc_f32_e32 v13, v13 +; GFX10-NEXT: v_trunc_f32_e32 v12, v12 +; GFX10-NEXT: v_mad_f32 v14, -v10, v1, v14 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GFX10-NEXT: v_mad_f32 v16, -v11, v3, v4 +; GFX10-NEXT: v_mad_f32 v17, -v13, v9, v1 +; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, v1 +; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX10-NEXT: v_mad_f32 v15, -v12, v4, v15 +; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v16|, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v17|, v9 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v4 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v9, 8, v9 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo +; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: udiv: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, 
vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x40207 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10 +; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11 +; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v4, v4 +; GFX9-NEXT: v_trunc_f32_e32 v11, v11 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v4 +; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12 +; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, v9 +; GFX9-NEXT: v_trunc_f32_e32 v12, v12 +; GFX9-NEXT: v_mul_f32_e32 v13, v9, v13 +; GFX9-NEXT: v_mad_f32 v15, -v12, v3, v10 +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GFX9-NEXT: v_trunc_f32_e32 v13, v13 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 +; GFX9-NEXT: v_mul_f32_e32 v14, v2, v14 +; GFX9-NEXT: v_mad_f32 v9, -v13, v10, v9 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc +; GFX9-NEXT: v_trunc_f32_e32 v14, v14 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, v3 +; GFX9-NEXT: v_mad_f32 v16, -v14, v4, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v12, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v9|, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v16|, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v14, vcc +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, 
v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = udiv <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: uitofp_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v9 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v9 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5020104 +; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX10-NEXT: global_store_dword v[5:6], v4, off +; GFX10-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: uitofp_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5020104 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v9 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 +; GFX9-NEXT: v_perm_b32 v10, v9, v4, s4 +; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: global_store_dword v[5:6], v10, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %insvec = uitofp <4 x i8> %shuffle0_0 to <4 x float> + store <4 x float> %insvec, ptr addrspace(1) %out1 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 + ret void +} + + +define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: urem_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: 
v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 +; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 +; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v15, v0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v2 +; GFX10-NEXT: v_mul_f32_e32 v10, v3, v10 +; GFX10-NEXT: v_mul_f32_e32 v11, v3, v11 +; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12 +; GFX10-NEXT: v_mul_f32_e32 v13, v15, v13 +; GFX10-NEXT: v_trunc_f32_e32 v10, v10 +; GFX10-NEXT: v_trunc_f32_e32 v11, v11 +; GFX10-NEXT: v_trunc_f32_e32 v12, v12 +; GFX10-NEXT: v_trunc_f32_e32 v13, v13 +; GFX10-NEXT: v_mad_f32 v18, -v10, v1, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GFX10-NEXT: v_mad_f32 v19, -v11, v3, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, v1 +; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GFX10-NEXT: v_mad_f32 v15, -v13, v9, v15 +; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, v3 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, v4 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, v16 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, v16, v1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo +; GFX10-NEXT: 
v_cmp_ge_f32_e64 vcc_lo, |v15|, v9 +; GFX10-NEXT: v_mul_lo_u32 v4, v4, v14 +; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_mul_lo_u32 v9, v9, v17 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, v16, v4 +; GFX10-NEXT: v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505 +; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dword v[5:6], v1, off +; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: urem_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x2050505 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v3 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v11, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v11 +; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v14, v4 +; GFX9-NEXT: v_trunc_f32_e32 v15, v15 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX9-NEXT: v_mul_f32_e32 
v16, v3, v16 +; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 +; GFX9-NEXT: v_trunc_f32_e32 v16, v16 +; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17 +; GFX9-NEXT: v_mad_f32 v20, -v16, v3, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v13, v9 +; GFX9-NEXT: v_trunc_f32_e32 v17, v17 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2 +; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 +; GFX9-NEXT: v_mad_f32 v21, -v17, v11, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v17 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v15, vcc +; GFX9-NEXT: v_trunc_f32_e32 v18, v18 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, v3 +; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13 +; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v16, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v21|, v11 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v17, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v18, vcc +; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v10 +; GFX9-NEXT: v_mul_lo_u32 v0, v11, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, v13, v12 +; GFX9-NEXT: v_sub_u32_e32 v2, v10, v2 +; GFX9-NEXT: v_sub_u32_sdwa v3, v10, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_e32 v0, v10, v0 +; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword 
v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = urem <4 x i8> %shuffle0_0, %vec1 + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: xor_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffff00 +; GFX10-NEXT: v_mov_b32_e32 v1, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v9 +; GFX10-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_e32 v0, 0x200, v0 +; GFX10-NEXT: v_xor_b32_e32 v3, 0x100, v3 +; 
GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5060307 +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: xor_store_div: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_movk_i32 s4, 0xff00 +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: s_mov_b32 s5, 0x5060307 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_sdwa v2, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff00, v9 +; GFX9-NEXT: v_xor_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_xor_b32_e32 v2, 0x200, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, 0x100, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v4, v9, v4, s5 +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword 
v[7:8], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %vecins = xor <4 x i8> %shuffle0_0, + store <4 x i8> %vecins, ptr addrspace(1) %out0 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 + ret void +} + + +define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX10-LABEL: zext_store_div: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0xff +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX10-NEXT: v_and_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v4, v9, 0x60504 +; GFX10-NEXT: v_perm_b32 v1, v3, v10, 0x5040100 +; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off +; GFX10-NEXT: global_store_dword v[5:6], v2, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: zext_store_div: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x60504 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v9 +; GFX9-NEXT: v_and_b32_sdwa v4, v4, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 +; GFX9-NEXT: v_perm_b32 v1, v3, v4, s6 +; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off +; GFX9-NEXT: global_store_dword v[5:6], v2, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid + %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid + %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 + %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + %insvec = zext <4 x i8> %shuffle0_0 to <4 x i16> + store <4 x i16> %insvec, ptr addrspace(1) %out1 + store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 + ret void +}