diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17,6 +17,7 @@ #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/ByteProvider.h" #include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" @@ -35,6 +36,7 @@ #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" +#include using namespace llvm; @@ -9534,7 +9536,7 @@ // value 0-3 selects corresponding source byte; // value 0xc selects zero; // value 0xff selects 0xff. -static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) { +static uint32_t getPermuteMask(SDValue V) { assert(V.getValueSizeInBits() == 32); if (V.getNumOperands() != 2) @@ -9550,15 +9552,13 @@ default: break; case ISD::AND: - if (uint32_t ConstMask = getConstantPermuteMask(C)) { + if (uint32_t ConstMask = getConstantPermuteMask(C)) return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); - } break; case ISD::OR: - if (uint32_t ConstMask = getConstantPermuteMask(C)) { + if (uint32_t ConstMask = getConstantPermuteMask(C)) return (0x03020100 & ~ConstMask) | ConstMask; - } break; case ISD::SHL: @@ -9716,8 +9716,8 @@ const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { - uint32_t LHSMask = getPermuteMask(DAG, LHS); - uint32_t RHSMask = getPermuteMask(DAG, RHS); + uint32_t LHSMask = getPermuteMask(LHS); + uint32_t RHSMask = getPermuteMask(RHS); if (LHSMask != ~0u && RHSMask != ~0u) { // Canonicalize the expression in an attempt to have fewer unique masks // and therefore fewer registers used to hold the masks. 
@@ -9763,6 +9763,246 @@ return SDValue(); } +// A key component of v_perm is a mapping between byte position of the src +// operands, and the byte position of the dest. To provide such, we need: 1. the +// node that provides x byte of the dest of the OR, and 2. the byte of the node +// used to provide that x byte. calculateByteProvider finds which node provides +// a certain byte of the dest of the OR, and calculateSrcByte takes that node, +// and finds an ultimate src and byte position For example: The supported +// LoadCombine pattern for vector loads is as follows +// t1 +// or +// / \ +// t2 t3 +// zext shl +// | | \ +// t4 t5 16 +// or anyext +// / \ | +// t6 t7 t8 +// srl shl or +// / | / \ / \ +// t9 t10 t11 t12 t13 t14 +// trunc* 8 trunc* 8 and and +// | | / | | \ +// t15 t16 t17 t18 t19 t20 +// trunc* 255 srl -256 +// | / \ +// t15 t15 16 +// +// *In this example, the truncs are from i32->i16 +// +// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3 +// respectively. calculateSrcByte would find (given node) -> ultimate src & +// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3. +// After finding the mapping, we can combine the tree into vperm t15, t16, +// 0x05000407 + +// Find the source and byte position from a node. +// \p DestByte is the byte position of the dest of the or that the src +// ultimately provides. \p SrcIndex is the byte of the src that maps to this +// dest of the or byte. \p Depth tracks how many recursive iterations we have +// performed. 
+// Follows TRUNCATE and byte-aligned constant SRL nodes starting at \p Op until
+// a 32-bit node is reached, and reports that node together with the byte of
+// it that is being requested.
+//
+// \p DestByte is forwarded unchanged to the resulting ByteProvider.
+// \p SrcIndex is the byte of \p Op being requested; each SRL advances it by
+// the shift amount in bytes.
+// \p Depth is the current recursion depth; the walk stops at depth 6.
+//
+// Returns std::nullopt when the walk is too deep, a shift amount is not a
+// constant multiple of 8 bits, a TRUNCATE does not truncate from 32 bits, or
+// the node finally reached is not 32 bits wide. Returns a constant-zero
+// provider when the requested byte was shifted in by an SRL.
+static const std::optional<ByteProvider<SDValue>>
+calculateSrcByte(const SDValue &Op, uint64_t DestByte, uint64_t SrcIndex = 0,
+                 unsigned Depth = 0) {
+  // We may need to recursively traverse a series of SRLs
+  if (Depth >= 6)
+    return std::nullopt;
+
+  switch (Op->getOpcode()) {
+  case ISD::TRUNCATE: {
+    // The low bytes of a truncate are the low bytes of its operand, so the
+    // byte index carries over unchanged.
+    if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
+      return std::nullopt;
+    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+  }
+
+  case ISD::SRL: {
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!ShiftOp)
+      return std::nullopt;
+
+    uint64_t BitShift = ShiftOp->getZExtValue();
+    if (BitShift % 8 != 0)
+      return std::nullopt;
+
+    unsigned OpBitWidth = Op.getScalarValueSizeInBits();
+    if (OpBitWidth % 8 != 0)
+      return std::nullopt;
+
+    // Byte k of (srl X, S) is byte k + S/8 of X only while that byte exists
+    // in X. Past the top of X the logical shift has filled the byte with
+    // zeros, so it must not be reported as a (nonexistent) byte of X.
+    uint64_t ShiftedIndex = SrcIndex + BitShift / 8;
+    if (ShiftedIndex >= OpBitWidth / 8)
+      return ByteProvider<SDValue>::getConstantZero();
+
+    return calculateSrcByte(Op->getOperand(0), DestByte, ShiftedIndex,
+                            Depth + 1);
+  }
+
+  default: {
+    // Only a 32-bit node is accepted as the ultimate source.
+    if (Op.getScalarValueSizeInBits() != 32)
+      return std::nullopt;
+
+    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+  }
+  }
+  llvm_unreachable("fully handled switch");
+}
+
+// For a byte position in the result of an Or, traverse the tree and find the
+// node (and the byte of the node) which ultimately provides this {Or,
+// BytePosition}. \p Op is the operand we are currently examining. \p Index is
+// the byte position of the Op that corresponds with the originally requested
+// byte of the Or. \p Depth tracks how many recursive iterations we have
+// performed.
+// \p StartingIndex is the originally requested byte of the Or.
+//
+// Returns the provider of byte \p Index of \p Op: either a source node/byte
+// pair, or a constant zero. Returns std::nullopt whenever the byte cannot be
+// attributed with certainty (unsupported opcode, non-constant or non-byte
+// shift/mask, recursion depth of 8 reached, ...). Callers must treat
+// std::nullopt as "unknown", never as zero.
+static const std::optional<ByteProvider<SDValue>>
+calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
+                      unsigned StartingIndex = 0) {
+  // Finding Src tree of RHS of or typically requires at least 1 additional
+  // depth
+  if (Depth >= 8)
+    return std::nullopt;
+
+  unsigned BitWidth = Op.getScalarValueSizeInBits();
+  if (BitWidth % 8 != 0)
+    return std::nullopt;
+  assert(Index < BitWidth / 8 && "invalid index requested");
+
+  switch (Op.getOpcode()) {
+  case ISD::OR: {
+    // Both operands must be understood. An operand whose byte is unknown may
+    // contribute arbitrary bits (e.g. a non-zero constant), so it cannot be
+    // ignored in favour of the other operand.
+    auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
+                                     StartingIndex);
+    if (!LHS)
+      return std::nullopt;
+    auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
+                                     StartingIndex);
+    if (!RHS)
+      return std::nullopt;
+
+    // A well formed Or will only have nonzero bytes for one operand
+    if (LHS->isConstantZero())
+      return RHS;
+    if (RHS->isConstantZero())
+      return LHS;
+    return std::nullopt;
+  }
+
+  case ISD::AND: {
+    auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!BitMaskOp)
+      return std::nullopt;
+
+    // The mask is examined as a 64-bit value below.
+    if (BitWidth > 64)
+      return std::nullopt;
+
+    uint64_t BitMask = BitMaskOp->getZExtValue();
+    // Bits we expect for the byte at Index
+    uint64_t IndexMask = uint64_t(0xFF) << (Index * 8);
+
+    if ((IndexMask & BitMask) != IndexMask) {
+      // If the result of the and partially provides the byte, then it
+      // is not well formatted
+      if (IndexMask & BitMask)
+        return std::nullopt;
+      return ByteProvider<SDValue>::getConstantZero();
+    }
+
+    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
+  }
+
+  case ISD::SRL: {
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!ShiftOp)
+      return std::nullopt;
+
+    uint64_t BitShift = ShiftOp->getZExtValue();
+    if (BitShift % 8)
+      return std::nullopt;
+
+    uint64_t BytesProvided = BitWidth / 8;
+    uint64_t ByteShift = BitShift / 8;
+    // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
+    // If the byte we are trying to provide (as tracked by index) falls in this
+    // range, then the SRL provides the byte. The byte of interest of the src of
+    // the SRL is Index + ByteShift
+    if (BytesProvided - ByteShift > Index)
+      return calculateSrcByte(Op->getOperand(0), StartingIndex,
+                              Index + ByteShift);
+    return ByteProvider<SDValue>::getConstantZero();
+  }
+
+  case ISD::SHL: {
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!ShiftOp)
+      return std::nullopt;
+
+    uint64_t BitShift = ShiftOp->getZExtValue();
+    if (BitShift % 8 != 0)
+      return std::nullopt;
+    uint64_t ByteShift = BitShift / 8;
+
+    // Bytes below the shift amount are zeros. Any other byte is not
+    // definitively zero, and corresponds to byte Index - ByteShift of the
+    // shifted operand.
+    if (Index < ByteShift)
+      return ByteProvider<SDValue>::getConstantZero();
+    return calculateByteProvider(Op.getOperand(0), Index - ByteShift,
+                                 Depth + 1, StartingIndex);
+  }
+
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND: {
+    SDValue NarrowOp = Op->getOperand(0);
+    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
+      return std::nullopt;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    // Bytes above the narrow operand are known (zero) only for ZERO_EXTEND.
+    if (Index >= NarrowByteWidth) {
+      if (Op.getOpcode() == ISD::ZERO_EXTEND)
+        return ByteProvider<SDValue>::getConstantZero();
+      return std::nullopt;
+    }
+    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
+  }
+
+  case ISD::TRUNCATE: {
+    // Only bytes that survive the truncation can be provided. This repeats
+    // the assert above so that builds without asserts are covered too.
+    if (Index >= BitWidth / 8)
+      return std::nullopt;
+    return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
+                                 StartingIndex);
+  }
+
+  case ISD::LOAD: {
+    auto L = cast<LoadSDNode>(Op.getNode());
+    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
+      return std::nullopt;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    // If the width of the load does not reach byte we are trying to provide for
+    // and it is not a ZEXTLOAD, then the load does not provide for the byte in
+    // question
+    if (Index >= NarrowByteWidth) {
+      if (L->getExtensionType() == ISD::ZEXTLOAD)
+        return ByteProvider<SDValue>::getConstantZero();
+      return std::nullopt;
+    }
+
+    return calculateSrcByte(Op, StartingIndex, Index);
+  }
+
+  default: {
+    return std::nullopt;
+  }
+  }
+
+  llvm_unreachable("fully handled switch");
+}
+
 SDValue SITargetLowering::performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -9813,8 +10053,40 @@ const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { - uint32_t LHSMask = getPermuteMask(DAG, LHS); - uint32_t RHSMask = getPermuteMask(DAG, RHS); + + bool IsCombineVectorized = false; + for (auto OrUse : N->uses()) { + // Only special case bitcast to vectors + if (OrUse->getOpcode() != ISD::BITCAST || + !OrUse->getValueType(0).isVector()) { + IsCombineVectorized = true; + continue; + } + + // If all the users of the or bitcasted to vector are not known to be + // 
vectorized ops, then conservatively do not attempt to combine. + for (auto VUse : OrUse->uses()) { + if (VUse->getOpcode() == ISD::BITCAST || + !VUse->getValueType(0).isVector()) { + IsCombineVectorized = true; + break; + } + + // TODO -- less conservaite conditions for IsCombineVectorized + for (auto VectorwiseOp : {ISD::STORE}) + if (VUse->getOpcode() == VectorwiseOp) { + IsCombineVectorized = true; + break; + } + } + } + + if (!IsCombineVectorized) + return SDValue(); + + uint32_t LHSMask = getPermuteMask(LHS); + uint32_t RHSMask = getPermuteMask(RHS); + if (LHSMask != ~0u && RHSMask != ~0u) { // Canonicalize the expression in an attempt to have fewer unique masks // and therefore fewer registers used to hold the masks. @@ -9841,12 +10113,61 @@ // Combine masks uint32_t Sel = LHSMask | RHSMask; SDLoc DL(N); - return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), RHS.getOperand(0), DAG.getConstant(Sel, DL, MVT::i32)); } } + if (LHSMask == ~0u || RHSMask == ~0u) { + SmallVector, 8> PermNodes; + + // VT is known to be MVT::i32, so we need to provide 4 bytes. 
+ assert(VT == MVT::i32); + for (int i = 0; i < 4; i++) { + // Find the ByteProvider that provides the ith byte of the result of OR + std::optional> P = + calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); + // TODO support constantZero + if (!P || P->isConstantZero()) + return SDValue(); + + PermNodes.push_back(*P); + } + if (PermNodes.size() != 4) + return SDValue(); + + int FirstSrc = 0; + int SecondSrc = -1; + uint64_t permMask = 0x00000000; + for (size_t i = 0; i < PermNodes.size(); i++) { + auto PermOp = PermNodes[i]; + // Since the mask is applied to Src1:Src2, Src1 bytes must be offset + // by sizeof(Src2) = 4 + int SrcByteAdjust = 4; + + if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { + if (SecondSrc != -1) + if (!PermOp.hasSameSrc(PermNodes[SecondSrc])) + return SDValue(); + // Set the index of the second distinct Src node + SecondSrc = i; + assert(PermNodes[SecondSrc].Src->getValueType().getSizeInBits() == + 32); + SrcByteAdjust = 0; + } + assert(PermOp.SrcOffset + SrcByteAdjust < 8); + assert(!DAG.getDataLayout().isBigEndian()); + permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8); + } + + SDLoc DL(N); + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, + *PermNodes[FirstSrc].Src, + SecondSrc == -1 ? 
*PermNodes[FirstSrc].Src + : *PermNodes[SecondSrc].Src, + DAG.getConstant(permMask, DL, MVT::i32)); + } } if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -42,16 +42,11 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_mov_b32 s0, 0x6050400 +; GCN-NEXT: s_mov_b32 s0, 0x7050604 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_u32 v3, v2, 16, 8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v2 -; GCN-NEXT: v_perm_b32 v3, v3, v2, s0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff0000, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff000000, v2 -; GCN-NEXT: v_or3_b32 v2, v3, v4, v2 +; GCN-NEXT: v_perm_b32 v2, v2, v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm entry: @@ -96,8 +91,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v2, v[0:1], off ; GCN-NEXT: global_load_ushort v3, v[0:1], off offset:4 +; GCN-NEXT: s_mov_b32 s4, 0x1000504 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GCN-NEXT: v_perm_b32 v0, v2, v3, s4 ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 2 %p.0 = load i16, i16 addrspace(1)* %p, align 4 @@ -238,9 +234,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v2, v[0:1], off ; GCN-NEXT: global_load_dword v3, v[0:1], off offset:4 -; GCN-NEXT: s_mov_b32 s4, 0xffff0000 +; GCN-NEXT: s_mov_b32 s4, 0x3020504 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_or_b32 v0, v3, s4, v2 +; GCN-NEXT: v_perm_b32 v0, v2, v3, s4 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 3 diff --git 
a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1293,7 +1293,6 @@ ; This should not be adding instructions to shift into the correct ; position in the word for the component. -; FIXME: Packing bytes define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: @@ -1426,6 +1425,179 @@ ret void } +; The other use of shuffle0_0 make it profitable to lower into v_perm + +define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out1, <4 x i8> addrspace(1)* noalias %in, <4 x i8> addrspace(1)* noalias %in1) nounwind { +; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2 +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v4, v6 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 +; SI-NEXT: v_alignbit_b32 v5, v3, v5, 24 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; 
SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s8, 0x4000405 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ubyte v6, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v3, v[4:5] +; VI-NEXT: flat_load_ubyte v4, v[0:1] +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: v_perm_b32 v4, v4, v5, s8 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; 
GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 +; GFX10-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshl_or_b32 v6, v2, 8, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405 +; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dword v7, v4, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 +; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX9-NEXT: s_mov_b32 s4, 0x4000405 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v7, v2, 8, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_perm_b32 v4, v6, v7, s4 +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dword v5, v4, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: 
load_v4i8_to_v4f32_unaligned_multiuse: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 +; GFX11-NEXT: global_load_u8 v3, v0, s[4:5] offset:3 +; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:3 +; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v4, v3, 8, v1 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v5, v2, 8, v0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v4, v4, v5, 0x4000405 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b32 v6, v4, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid + %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in1, i32 %tid + %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 + %load1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1, align 1 + %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> + %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16 + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out1, align 4 + ret void +} + + ; FIXME: Need to handle non-uniform case for function below (load without gep). ; Instructions still emitted to repack bytes for add use. 
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { @@ -2661,10 +2833,11 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s0, 0x7060504 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; VI-NEXT: v_perm_b32 v0, v0, v0, s0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2678,7 +2851,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060504 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: global_store_dword v2, v0, s[2:3] @@ -2691,8 +2864,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0x7060504 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] @@ -2706,7 +2880,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060504 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ 
b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -564,6 +564,7 @@ ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-ALIGNED-NEXT: s_mov_b32 s0, 0x1000504 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 @@ -574,16 +575,15 @@ ; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:33 ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:34 ; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:35 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4 +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 +; GFX9-ALIGNED-NEXT: v_perm_b32 v2, v2, v4, s0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-ALIGNED-NEXT: v_perm_b32 v1, v3, v1, s0 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm @@ -657,6 +657,7 @@ ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-ALIGNED-NEXT: s_mov_b32 s0, 0x1000504 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5 @@ -667,16 +668,15 @@ ; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:10 ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:11 ; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:12 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 ; GFX9-ALIGNED-NEXT: 
s_waitcnt lgkmcnt(4) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4 +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 +; GFX9-ALIGNED-NEXT: v_perm_b32 v2, v2, v4, s0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-ALIGNED-NEXT: v_perm_b32 v1, v3, v1, s0 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm @@ -746,10 +746,11 @@ ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 ; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 offset:34 +; GFX9-ALIGNED-NEXT: s_mov_b32 s2, 0x1000504 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: v_perm_b32 v2, v2, v3, s2 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-ALIGNED-NEXT: v_perm_b32 v1, v4, v1, s2 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -38,8 +38,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:2 +; GFX9-NEXT: s_mov_b32 s4, 0x1000504 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v0, v2, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_load_2xi16_align2: @@ -50,7 
+51,7 @@ ; GFX10-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-NEXT: global_load_ushort v3, v[0:1], off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v2, v3, 0x1000504 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_load_2xi16_align2: @@ -61,7 +62,7 @@ ; GFX11-NEXT: global_load_u16 v2, v[0:1], off ; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x1000504 ; GFX11-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -39,8 +39,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen ; GFX9-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 +; GFX9-NEXT: s_mov_b32 s4, 0x1000504 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2: @@ -48,8 +49,9 @@ ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off ; GFX9-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2 +; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0x1000504 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX9-FLASTSCR-NEXT: v_perm_b32 v0, v1, v2, s0 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: private_load_2xi16_align2: @@ -60,7 +62,7 @@ ; GFX10-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen ; GFX10-NEXT: buffer_load_ushort v2, v0, 
s[0:3], 0 offen offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x1000504 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align2: @@ -71,7 +73,7 @@ ; GFX10-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off ; GFX10-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2 ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-FLASTSCR-NEXT: v_perm_b32 v0, v1, v2, 0x1000504 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_load_2xi16_align2: @@ -82,7 +84,7 @@ ; GFX11-NEXT: scratch_load_u16 v1, v0, off ; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x1000504 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2: @@ -93,7 +95,7 @@ ; GFX11-FLASTSCR-NEXT: scratch_load_u16 v1, v0, off ; GFX11-FLASTSCR-NEXT: scratch_load_u16 v0, v0, off offset:2 ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FLASTSCR-NEXT: v_perm_b32 v0, v1, v0, 0x1000504 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 @@ -235,20 +237,18 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_mov_b32 s4, 0x7060504 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align1: ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off -; 
GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff +; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0x7060504 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX9-FLASTSCR-NEXT: v_perm_b32 v0, v0, v0, s0 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: private_load_2xi16_align1: @@ -257,8 +257,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060504 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align1: @@ -267,8 +266,7 @@ ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-FLASTSCR-NEXT: v_perm_b32 v0, v0, v0, 0x7060504 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_load_2xi16_align1: @@ -277,9 +275,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, v0, 0x7060504 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align1: @@ -288,9 +284,7 @@ ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-FLASTSCR-NEXT: v_perm_b32 v0, v0, v0, 0x7060504 ; 
GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -755,10 +755,10 @@ ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x3020706 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-NEXT: v_alignbit_b32 v2, v2, s4, 16 +; VI-NEXT: v_perm_b32 v2, s4, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1594,6 +1594,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1601,11 +1602,9 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v0, s0, v4, v0 +; VI-NEXT: v_perm_b32 v0, s4, v0, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1751,6 +1750,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1758,11 +1758,9 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff -; 
VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 +; VI-NEXT: v_perm_b32 v1, s4, v1, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1908,6 +1906,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1915,11 +1914,9 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 +; VI-NEXT: v_perm_b32 v1, s4, v1, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -2689,6 +2686,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: v_mov_b32_e32 v12, 0x3020504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 @@ -2701,11 +2699,9 @@ ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_mov_b32 s2, 0xffff -; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_bfi_b32 v3, s2, v12, v3 +; VI-NEXT: v_perm_b32 v3, s4, v3, v12 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -2258,9 +2258,9 @@ ; GFX803-NEXT: s_mov_b32 m0, -1 ; 
GFX803-NEXT: ds_read_u16 v1, v0 ; GFX803-NEXT: ds_read_u16 v0, v0 offset:2 +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_local_v2i16_split_multi_chain: @@ -2306,12 +2306,11 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 -; GFX803-NEXT: ds_read_u16 v1, v0 offset:16 -; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: ds_read_u16 v1, v0 +; GFX803-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_samechain: @@ -2411,9 +2410,9 @@ ; GFX803-NEXT: ds_read_u16 v2, v0 ; GFX803-NEXT: ds_write_b16 v1, v3 ; GFX803-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_side_effect: @@ -2469,8 +2468,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_global_v2i16_split: @@ -2523,9 +2522,10 @@ ; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc -; 
GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 +; GFX803-NEXT: s_waitcnt lgkmcnt(0) +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_flat_v2i16_split: @@ -2575,9 +2575,9 @@ ; GFX803-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc ; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_constant_v2i16_split: @@ -2628,8 +2628,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x1000504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_private_v2i16_split: diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -221,9 +221,9 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -682,12 +682,12 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: 
ds_read_u16 v0, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: ds_write_b16 v3, v2 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -794,9 +794,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -836,9 +836,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1050,9 +1050,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1089,9 +1089,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1292,9 +1292,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1394,9 +1394,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1444,8 +1444,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1493,8 +1493,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: 
s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1542,8 +1542,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1843,9 +1843,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1885,9 +1885,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x3020504 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -2026,8 +2026,8 @@ ; GFX803-NEXT: v_mov_b32_e32 v2, 44 ; GFX803-NEXT: buffer_load_ushort v1, v2, s[0:3], s32 offen offset:4054 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 
+; GFX803-NEXT: s_mov_b32 s4, 0x3020504 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -71,26 +71,23 @@ ; GFX9-NEXT: ds_read_u8 v14, v0 offset:13 ; GFX9-NEXT: ds_read_u8 v15, v0 offset:14 ; GFX9-NEXT: ds_read_u8 v16, v0 offset:15 -; GFX9-NEXT: s_waitcnt lgkmcnt(14) -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 8, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(12) -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 8, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(10) -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 8, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x1000504 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(8) -; GFX9-NEXT: v_lshl_or_b32 v2, v8, 8, v7 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-NEXT: v_lshl_or_b32 v2, v10, 8, v9 +; GFX9-NEXT: v_lshl_or_b32 v1, v8, 8, v7 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 8, v5 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_lshl_or_b32 v3, v12, 8, v11 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshl_or_b32 v3, v14, 8, v13 +; GFX9-NEXT: v_lshl_or_b32 v2, v12, 8, v11 +; GFX9-NEXT: v_lshl_or_b32 v3, v10, 8, v9 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v4, v16, 8, v15 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v16, 8, v15 +; GFX9-NEXT: v_lshl_or_b32 v4, v14, 8, v13 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align1: @@ -237,30 +234,30 @@ ; GFX10-NEXT: ds_read_u8 v10, 
v0 offset:9 ; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 ; GFX10-NEXT: ds_read_u8 v12, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v13, v0 offset:12 -; GFX10-NEXT: ds_read_u8 v14, v0 offset:13 -; GFX10-NEXT: ds_read_u8 v15, v0 offset:14 -; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX10-NEXT: ds_read_u8 v13, v0 offset:14 +; GFX10-NEXT: ds_read_u8 v14, v0 offset:15 +; GFX10-NEXT: ds_read_u8 v15, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:13 ; GFX10-NEXT: s_waitcnt lgkmcnt(14) ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(12) -; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 8, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(10) -; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) -; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX10-NEXT: v_lshl_or_b32 v2, v8, 8, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9 +; GFX10-NEXT: v_lshl_or_b32 v6, v10, 8, v9 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_lshl_or_b32 v6, v12, 8, v11 +; GFX10-NEXT: v_lshl_or_b32 v5, v12, 8, v11 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v7, v14, 8, v13 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v8, v0, 8, v15 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX10-NEXT: v_perm_b32 v0, v1, v3, 0x1000504 +; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x1000504 +; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x1000504 +; GFX10-NEXT: v_perm_b32 v3, v8, v7, 0x1000504 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: load_lds_v4i32_align1: @@ -279,31 +276,31 @@ ; GFX11-NEXT: ds_load_u8 v10, v0 offset:9 ; GFX11-NEXT: ds_load_u8 v11, v0 offset:10 ; GFX11-NEXT: ds_load_u8 v12, v0 offset:11 -; GFX11-NEXT: ds_load_u8 v13, v0 offset:12 -; GFX11-NEXT: ds_load_u8 v14, v0 offset:13 -; GFX11-NEXT: 
ds_load_u8 v15, v0 offset:14 -; GFX11-NEXT: ds_load_u8 v0, v0 offset:15 +; GFX11-NEXT: ds_load_u8 v13, v0 offset:14 +; GFX11-NEXT: ds_load_u8 v14, v0 offset:15 +; GFX11-NEXT: ds_load_u8 v15, v0 offset:12 +; GFX11-NEXT: ds_load_u8 v0, v0 offset:13 ; GFX11-NEXT: s_waitcnt lgkmcnt(14) ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(12) -; GFX11-NEXT: v_lshl_or_b32 v2, v4, 8, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v4, 8, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(10) -; GFX11-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX11-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX11-NEXT: s_waitcnt lgkmcnt(8) -; GFX11-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 8, v7 ; GFX11-NEXT: s_waitcnt lgkmcnt(6) -; GFX11-NEXT: v_lshl_or_b32 v5, v10, 8, v9 +; GFX11-NEXT: v_lshl_or_b32 v6, v10, 8, v9 ; GFX11-NEXT: s_waitcnt lgkmcnt(4) -; GFX11-NEXT: v_lshl_or_b32 v6, v12, 8, v11 +; GFX11-NEXT: v_lshl_or_b32 v5, v12, 8, v11 ; GFX11-NEXT: s_waitcnt lgkmcnt(2) ; GFX11-NEXT: v_lshl_or_b32 v7, v14, 8, v13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshl_or_b32 v8, v0, 8, v15 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX11-NEXT: v_perm_b32 v0, v1, v3, 0x1000504 +; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x1000504 +; GFX11-NEXT: v_perm_b32 v2, v6, v5, 0x1000504 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX11-NEXT: v_perm_b32 v3, v8, v7, 0x1000504 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 ret <4 x i32> %load @@ -321,14 +318,15 @@ ; GFX9-NEXT: ds_read_u16 v6, v0 offset:10 ; GFX9-NEXT: ds_read_u16 v7, v0 offset:12 ; GFX9-NEXT: ds_read_u16 v8, v0 offset:14 +; GFX9-NEXT: s_mov_b32 s4, 0x1000504 ; GFX9-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v1, v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 
16, v3 +; GFX9-NEXT: v_perm_b32 v1, v3, v4, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX9-NEXT: v_perm_b32 v2, v5, v6, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX9-NEXT: v_perm_b32 v3, v7, v8, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align2: @@ -407,13 +405,13 @@ ; GFX10-NEXT: ds_read_u16 v7, v0 offset:12 ; GFX10-NEXT: ds_read_u16 v8, v0 offset:14 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x1000504 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v1, v3, v4, 0x1000504 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX10-NEXT: v_perm_b32 v2, v5, v6, 0x1000504 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX10-NEXT: v_perm_b32 v3, v7, v8, 0x1000504 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: load_lds_v4i32_align2: @@ -429,13 +427,13 @@ ; GFX11-NEXT: ds_load_u16 v7, v0 offset:12 ; GFX11-NEXT: ds_load_u16 v8, v0 offset:14 ; GFX11-NEXT: s_waitcnt lgkmcnt(6) -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v2, 0x1000504 ; GFX11-NEXT: s_waitcnt lgkmcnt(4) -; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX11-NEXT: v_perm_b32 v1, v3, v4, 0x1000504 ; GFX11-NEXT: s_waitcnt lgkmcnt(2) -; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX11-NEXT: v_perm_b32 v2, v5, v6, 0x1000504 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX11-NEXT: v_perm_b32 v3, v7, v8, 0x1000504 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2 ret <4 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -67,21 +67,19 @@ ; 
GFX9-NEXT: ds_read_u8 v10, v0 offset:9 ; GFX9-NEXT: ds_read_u8 v11, v0 offset:10 ; GFX9-NEXT: ds_read_u8 v12, v0 offset:11 -; GFX9-NEXT: s_waitcnt lgkmcnt(10) -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 8, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(8) -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 8, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 8, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x1000504 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_lshl_or_b32 v2, v8, 8, v7 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshl_or_b32 v2, v10, 8, v9 +; GFX9-NEXT: v_lshl_or_b32 v1, v8, 8, v7 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 8, v5 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v3, v12, 8, v11 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v2, v12, 8, v11 +; GFX9-NEXT: v_lshl_or_b32 v3, v10, 8, v9 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align1: @@ -198,25 +196,25 @@ ; GFX10-NEXT: ds_read_u8 v6, v0 offset:5 ; GFX10-NEXT: ds_read_u8 v7, v0 offset:6 ; GFX10-NEXT: ds_read_u8 v8, v0 offset:7 -; GFX10-NEXT: ds_read_u8 v9, v0 offset:8 -; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 -; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 -; GFX10-NEXT: ds_read_u8 v0, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v9, v0 offset:10 +; GFX10-NEXT: ds_read_u8 v10, v0 offset:11 +; GFX10-NEXT: ds_read_u8 v11, v0 offset:8 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:9 ; GFX10-NEXT: s_waitcnt lgkmcnt(10) ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) -; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 8, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX10-NEXT: v_lshl_or_b32 v4, 
v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX10-NEXT: v_lshl_or_b32 v2, v8, 8, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v6, v0, 8, v11 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX10-NEXT: v_perm_b32 v0, v1, v3, 0x1000504 +; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x1000504 +; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x1000504 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: load_lds_v3i32_align1: @@ -231,26 +229,26 @@ ; GFX11-NEXT: ds_load_u8 v6, v0 offset:5 ; GFX11-NEXT: ds_load_u8 v7, v0 offset:6 ; GFX11-NEXT: ds_load_u8 v8, v0 offset:7 -; GFX11-NEXT: ds_load_u8 v9, v0 offset:8 -; GFX11-NEXT: ds_load_u8 v10, v0 offset:9 -; GFX11-NEXT: ds_load_u8 v11, v0 offset:10 -; GFX11-NEXT: ds_load_u8 v0, v0 offset:11 +; GFX11-NEXT: ds_load_u8 v9, v0 offset:10 +; GFX11-NEXT: ds_load_u8 v10, v0 offset:11 +; GFX11-NEXT: ds_load_u8 v11, v0 offset:8 +; GFX11-NEXT: ds_load_u8 v0, v0 offset:9 ; GFX11-NEXT: s_waitcnt lgkmcnt(10) ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX11-NEXT: s_waitcnt lgkmcnt(8) -; GFX11-NEXT: v_lshl_or_b32 v2, v4, 8, v3 +; GFX11-NEXT: v_lshl_or_b32 v3, v4, 8, v3 ; GFX11-NEXT: s_waitcnt lgkmcnt(6) -; GFX11-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX11-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX11-NEXT: s_waitcnt lgkmcnt(4) -; GFX11-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX11-NEXT: v_lshl_or_b32 v2, v8, 8, v7 ; GFX11-NEXT: s_waitcnt lgkmcnt(2) ; GFX11-NEXT: v_lshl_or_b32 v5, v10, 8, v9 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_lshl_or_b32 v6, v0, 8, v11 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX11-NEXT: v_perm_b32 v0, v1, v3, 0x1000504 +; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x1000504 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 
+; GFX11-NEXT: v_perm_b32 v2, v6, v5, 0x1000504 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 ret <3 x i32> %load @@ -266,12 +264,13 @@ ; GFX9-NEXT: ds_read_u16 v4, v0 offset:6 ; GFX9-NEXT: ds_read_u16 v5, v0 offset:8 ; GFX9-NEXT: ds_read_u16 v6, v0 offset:10 +; GFX9-NEXT: s_mov_b32 s4, 0x1000504 ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v1, v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX9-NEXT: v_perm_b32 v1, v3, v4, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX9-NEXT: v_perm_b32 v2, v5, v6, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align2: @@ -335,11 +334,11 @@ ; GFX10-NEXT: ds_read_u16 v5, v0 offset:8 ; GFX10-NEXT: ds_read_u16 v6, v0 offset:10 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x1000504 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX10-NEXT: v_perm_b32 v1, v3, v4, 0x1000504 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX10-NEXT: v_perm_b32 v2, v5, v6, 0x1000504 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: load_lds_v3i32_align2: @@ -353,11 +352,11 @@ ; GFX11-NEXT: ds_load_u16 v5, v0 offset:8 ; GFX11-NEXT: ds_load_u16 v6, v0 offset:10 ; GFX11-NEXT: s_waitcnt lgkmcnt(4) -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v2, 0x1000504 ; GFX11-NEXT: s_waitcnt lgkmcnt(2) -; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX11-NEXT: v_perm_b32 v1, v3, v4, 0x1000504 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX11-NEXT: v_perm_b32 v2, v5, v6, 0x1000504 ; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2 ret <3 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll 
b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -191,8 +191,8 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_mov_b32 s0, 0x1000504 +; GFX8-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 ; GFX8-NEXT: ;;#ASMEND @@ -271,10 +271,10 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_mov_b32 s0, 0x1000504 ; GFX8-NEXT: s_mov_b32 s3, 0x1100f000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 9, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -187,8 +187,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: flat_load_dword v1, v[2:3] glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: s_mov_b32 s0, 0x1000504 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX803-NEXT: ;;#ASMSTART ; GFX803-NEXT: ; use v0 ; GFX803-NEXT: ;;#ASMEND @@ -265,10 +265,10 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: flat_load_dword v1, v[2:3] glc ; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: s_mov_b32 s0, 0x1000504 ; GFX803-NEXT: s_mov_b32 s3, 0x1100f000 ; GFX803-NEXT: s_mov_b32 s2, -1 -; GFX803-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s0 ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 9, v0 ; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX803-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll --- a/llvm/test/CodeGen/AMDGPU/permute.ll +++ b/llvm/test/CodeGen/AMDGPU/permute.ll @@ -256,14 +256,16 @@ ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104 +; GCN-NEXT: s_mov_b32 s1, 0x7060504 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: flat_load_dword v2, v[0:1] +; GCN-NEXT: s_or_b32 s0, s0, 0xff0000ff ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_perm_b32 v2, v2, s0, v3 +; GCN-NEXT: v_perm_b32 v2, v2, v2, s1 +; GCN-NEXT: v_and_b32_e32 v2, s0, v2 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm bb: @@ -284,6 +286,7 @@ ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_mov_b32 s1, 0xc060c04 ; GCN-NEXT: v_mov_b32_e32 v5, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s3 @@ -296,8 +299,7 @@ ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, 4, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff00ff, v4 +; GCN-NEXT: v_perm_b32 v4, v4, v4, s1 ; GCN-NEXT: v_or_b32_e32 v4, s0, v4 ; GCN-NEXT: flat_store_dword v[0:1], v4 ; GCN-NEXT: flat_store_dword v[2:3], v5 @@ -360,8 +362,8 @@ ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: s_mov_b32 s1, 0x7060504 ; 
GCN-NEXT: v_mov_b32_e32 v5, 0xffff0500 -; GCN-NEXT: v_mov_b32_e32 v6, 0xffff8004 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -369,9 +371,10 @@ ; GCN-NEXT: flat_load_dword v4, v[0:1] ; GCN-NEXT: s_or_b32 s0, s0, 4 ; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v6, 0xffff8004 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v4, 0x8000, v4 +; GCN-NEXT: v_perm_b32 v4, v4, v4, s1 ; GCN-NEXT: v_perm_b32 v4, v4, s0, v5 ; GCN-NEXT: flat_store_dword v[0:1], v4 ; GCN-NEXT: flat_store_dword v[2:3], v6 diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -0,0 +1,678 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 + + +define hidden void @shuffle6766(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %in1, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle6766: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6060706 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle6766: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x6060706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 
s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle3744(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %in1, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle3744: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x307 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle3744: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_movk_i32 s4, 0x307 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle4445(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %in1, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle4445: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040404 +; GFX10-NEXT: global_store_dword 
v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle4445: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040404 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle0101(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %in1, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle0101: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle0101: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040504 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle7533(<4 x i8> addrspace(0)* %in0, 
<4 x i8> addrspace(0)* %in1, <4 x i8> addrspace(0)* %out0) { +; GFX10-LABEL: shuffle7533: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_load_dword v6, v[0:1] +; GFX10-NEXT: flat_load_dword v7, v[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3030507 +; GFX10-NEXT: flat_store_dword v[4:5], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle7533: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dword v6, v[0:1] +; GFX9-NEXT: flat_load_dword v7, v[2:3] +; GFX9-NEXT: s_mov_b32 s4, 0x3030507 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 +; GFX9-NEXT: flat_store_dword v[4:5], v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(0)* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8> addrspace(0)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(0)* %out0, align 4 + ret void +} + +define hidden void @shuffle7767(<4 x i8> addrspace(0)* %in0, <4 x i8> addrspace(0)* %in1, <4 x i8> addrspace(0)* %out0) { +; GFX10-LABEL: shuffle7767: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_load_dword v0, v[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060707 +; GFX10-NEXT: flat_store_dword v[4:5], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle7767: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dword v0, v[2:3] +; GFX9-NEXT: s_mov_b32 s4, 0x7060707 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: flat_store_dword v[4:5], v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(0)* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8> addrspace(0)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(0)* %out0, align 4 + ret void +} + +define hidden void @shuffle0554(<4 x i8> addrspace(3)* %in0, <4 x i8> addrspace(3)* %in1, <4 x i8> addrspace(3)* %out0) { +; GFX10-LABEL: shuffle0554: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: ds_read_b32 v1, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x10104 +; GFX10-NEXT: ds_write_b32 v2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle0554: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: ds_read_b32 v1, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x10104 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: ds_write_b32 v2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(3)* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8> addrspace(3)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(3)* %out0, align 4 + ret void +} + +define hidden void @shuffle2127(<4 x i8> addrspace(3)* %in0, <4 x i8> addrspace(3)* %in1, <4 x i8> addrspace(3)* %out0) { +; GFX10-LABEL: shuffle2127: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: ds_read_b32 v1, v1 +; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3060506 +; GFX10-NEXT: ds_write_b32 v2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle2127: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: ds_read_b32 v1, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x3060506 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: ds_write_b32 v2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(3)* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8> addrspace(3)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(3)* %out0, align 4 + ret void +} + +define hidden void @shuffle5047(<4 x i8> addrspace(5)* %in0, <4 x i8> addrspace(5)* %in1, <4 x i8> addrspace(5)* %out0) { +; GFX10-LABEL: shuffle5047: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX10-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v4, v3, 0x7040005 +; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle5047: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; GFX9-NEXT: s_mov_b32 s4, 0x7040005 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v4, v3, s4 +; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(5)* %in0, align 4 + 
%vec1 = load <4 x i8>, <4 x i8> addrspace(5)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(5)* %out0, align 4 + ret void +} + +define hidden void @shuffle3546(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %in1, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle3546: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2000107 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle3546: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x2000107 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + + +define hidden void @shuffle7330ud2(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle7330ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x4070706 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: shuffle7330ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x4070706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle5341ud2(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle5341ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040706 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle5341ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle6106ud2(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle6106ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], 
off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle6106ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040504 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + + +define hidden void @shuffle4327ud2(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle4327ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle4327ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle3263ud2(<4 x i8> addrspace(1)* %in0, <4 x i8> 
addrspace(1)* %out0) { +; GFX10-LABEL: shuffle3263ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060607 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle3263ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060607 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle2763ud2(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle2763ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle2763ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %shuffle0_0 = 
shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle1327ud2(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle1327ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060705 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle1327ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x7060705 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @shuffle0605ud2(<4 x i8> addrspace(1)* %in0, <4 x i8> addrspace(1)* %out0) { +; GFX10-LABEL: shuffle0605ud2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shuffle0605ud2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x5040504 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, 
v0, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8> addrspace(1)* %in0, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out0, align 4 + ret void +} + +define hidden void @insertUsesOr(<4 x i8>* %in0, <4 x i8>* %in1, i8 %elt, <4 x i8>* %out0) { +; GFX10-LABEL: insertUsesOr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: flat_store_dword v[5:6], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: insertUsesOr: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dword v0, v[0:1] +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: flat_store_dword v[5:6], v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1 + store <4 x i8> %vecins, <4 x i8>* %out0 + ret void +} + +define hidden void @addUsesOr(<4 x i8>* %in0, <4 x i8>* %in1, i8 %elt, <4 x i8>* %out0) { +; GFX10-LABEL: addUsesOr: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: flat_load_dword v4, v[0:1] +; GFX10-NEXT: flat_load_dword v7, v[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7 +; GFX10-NEXT: v_add_nc_u16 v2, v2, v3 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_add_nc_u16 v1, v4, v1 +; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: flat_store_dword v[5:6], v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: addUsesOr: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dword v4, v[0:1] +; GFX9-NEXT: flat_load_dword v7, v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: flat_store_dword v[5:6], v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4 + %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4 + %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> + %added = add <4 x i8> %shuffle0_0, 
%vec1 + store <4 x i8> %added, <4 x i8>* %out0 + ret void +} + + +define amdgpu_kernel void @shuffle8i8(<8 x i8> addrspace(1)* %in0, <8 x i8> addrspace(1)* %in1, <8 x i8> addrspace(1)* %out1) #0 { +; GFX10-LABEL: shuffle8i8: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshr_b32 s1, s1, 8 +; GFX10-NEXT: s_lshr_b32 s4, s9, 16 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, s9 +; GFX10-NEXT: v_and_b32_e64 v1, 0xffffff00, s8 +; GFX10-NEXT: v_lshlrev_b16 v2, 8, s4 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, s8 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: v_or_b32_sdwa v0, s1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX9-LABEL: shuffle8i8: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffffff00 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
s_lshr_b32 s1, s1, 8 +; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s9 +; GFX9-NEXT: v_or_b32_sdwa v4, s1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_lshr_b32 s1, s9, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: v_lshlrev_b16_e64 v3, 8, s8 +; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_lshlrev_b16_e64 v1, 8, s1 +; GFX9-NEXT: v_or_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_endpgm +bb: + %vec0 = load <8 x i8>, <8 x i8> addrspace(1)* %in0 + %vec1 = load <8 x i8>, <8 x i8> addrspace(1)* %in1 + %shuffle0 = shufflevector <8 x i8> %vec0, <8 x i8> %vec1, <8 x i32> + store <8 x i8> %shuffle0, <8 x i8> addrspace(1)* %out1 + ret void +} +