diff --git a/llvm/include/llvm/CodeGen/DAGCombine.h b/llvm/include/llvm/CodeGen/DAGCombine.h --- a/llvm/include/llvm/CodeGen/DAGCombine.h +++ b/llvm/include/llvm/CodeGen/DAGCombine.h @@ -10,6 +10,8 @@ #ifndef LLVM_CODEGEN_DAGCOMBINE_H #define LLVM_CODEGEN_DAGCOMBINE_H +#include "llvm/CodeGen/SelectionDAGNodes.h" + namespace llvm { enum CombineLevel { @@ -19,6 +21,43 @@ AfterLegalizeDAG }; +/// Represents the known origin of an individual byte in a combine pattern. The +/// value of the byte is either constant zero, or comes from memory / +/// some other productive instruction (e.g. arithmetic instructions). +/// Bit manipulation instructions like shifts are not ByteProviders; rather, +/// they are used to extract bytes. +struct ByteProvider { + // For constant zero providers Src is set to nullptr. For actual providers + // Src represents the node which originally produced the relevant bits. + // DestOffset is the offset of the byte within the combined destination value, + // and SrcOffset is the offset of the byte within the value produced by Src. + SDNode *Src = nullptr; + unsigned DestOffset = 0; + unsigned SrcOffset = 0; + + ByteProvider() = default; + + static ByteProvider getSrc(SDNode *Src, unsigned DestOffset, + unsigned SrcOffset) { + return ByteProvider(Src, DestOffset, SrcOffset); + } + + static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0, 0); } + bool isConstantZero() const { return !Src; } + + bool hasSrc() const { return Src; } + + bool hasSameSrc(const ByteProvider &Other) const { return Other.Src == Src; } + + bool operator==(const ByteProvider &Other) const { + return Other.Src == Src && Other.DestOffset == DestOffset && + Other.SrcOffset == SrcOffset; + } + +private: + ByteProvider(SDNode *Src, unsigned DestOffset, unsigned SrcOffset) + : Src(Src), DestOffset(DestOffset), SrcOffset(SrcOffset) {} +}; + } // end llvm namespace #endif diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7779,42 +7779,6 @@ return SDValue(); } -namespace { - -/// Represents known origin of an individual byte in load combine pattern. The -/// value of the byte is either constant zero or comes from memory. -struct ByteProvider { - // For constant zero providers Load is set to nullptr. For memory providers - // Load represents the node which loads the byte from memory. - // ByteOffset is the offset of the byte in the value produced by the load. - LoadSDNode *Load = nullptr; - unsigned ByteOffset = 0; - unsigned VectorOffset = 0; - - ByteProvider() = default; - - static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset, - unsigned VectorOffset) { - return ByteProvider(Load, ByteOffset, VectorOffset); - } - - static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0, 0); } - - bool isConstantZero() const { return !Load; } - bool isMemory() const { return Load; } - - bool operator==(const ByteProvider &Other) const { - return Other.Load == Load && Other.ByteOffset == ByteOffset && - Other.VectorOffset == VectorOffset; - } - -private: - ByteProvider(LoadSDNode *Load, unsigned ByteOffset, unsigned VectorOffset) - : Load(Load), ByteOffset(ByteOffset), VectorOffset(VectorOffset) {} -}; - -} // end anonymous namespace - /// Recursively traverses the expression calculating the origin of the requested /// byte of the given value. Returns None if the provider can't be calculated.
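To make the new DestOffset/SrcOffset bookkeeping concrete, here is a minimal standalone sketch; plain uint32_t values stand in for SDNodes, and every name in it is purely illustrative rather than part of the patch. It describes each byte of (A & 0xff) | (B << 24) with a provider and then reassembles the value from those providers.

#include <cassert>
#include <cstdint>
#include <vector>

// Simplified analogue of ByteProvider: Src == nullptr means "constant zero",
// DestOffset is the byte position in the combined value, and SrcOffset is the
// byte position within *Src.
struct SimpleByteProvider {
  const uint32_t *Src = nullptr;
  unsigned DestOffset = 0;
  unsigned SrcOffset = 0;
};

int main() {
  uint32_t A = 0x11223344, B = 0x55667788;
  // Providers for each byte of (A & 0xff) | (B << 24), byte 0 being the LSB:
  std::vector<SimpleByteProvider> Bytes = {
      {&A, 0, 0},      // byte 0 comes from byte 0 of A
      {nullptr, 1, 0}, // byte 1 is constant zero
      {nullptr, 2, 0}, // byte 2 is constant zero
      {&B, 3, 0},      // byte 3 comes from byte 0 of B
  };
  uint32_t Reassembled = 0;
  for (const SimpleByteProvider &P : Bytes) {
    uint32_t Byte = P.Src ? (*P.Src >> (8 * P.SrcOffset)) & 0xff : 0;
    Reassembled |= Byte << (8 * P.DestOffset);
  }
  assert(Reassembled == ((A & 0xff) | (B << 24)));
  return 0;
}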
/// @@ -7983,7 +7947,7 @@ : None; unsigned BPVectorIndex = VectorIndex.value_or(0U); - return ByteProvider::getMemory(L, Index, BPVectorIndex); + return ByteProvider::getSrc(L, Index, BPVectorIndex); } } @@ -8274,15 +8238,18 @@ bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto MemoryByteOffset = [&] (ByteProvider P) { - assert(P.isMemory() && "Must be a memory byte provider"); - unsigned LoadBitWidth = P.Load->getMemoryVT().getScalarSizeInBits(); + assert(P.hasSrc() && "Must be a memory byte provider"); + LoadSDNode *Load = cast<LoadSDNode>(P.Src); + assert(Load); + + unsigned LoadBitWidth = Load->getMemoryVT().getScalarSizeInBits(); assert(LoadBitWidth % 8 == 0 && "can only analyze providers for individual bytes not bit"); unsigned LoadByteWidth = LoadBitWidth / 8; return IsBigEndianTarget - ? bigEndianByteAt(LoadByteWidth, P.ByteOffset) - : littleEndianByteAt(LoadByteWidth, P.ByteOffset); + ? bigEndianByteAt(LoadByteWidth, P.DestOffset) + : littleEndianByteAt(LoadByteWidth, P.DestOffset); }; Optional<BaseIndexOffset> Base; @@ -8309,9 +8276,10 @@ return SDValue(); continue; } - assert(P->isMemory() && "provenance should either be memory or zero"); + assert(P->hasSrc() && "provenance should either be memory or zero"); - LoadSDNode *L = P->Load; + LoadSDNode *L = cast<LoadSDNode>(P->Src); + assert(L); // All loads must share the same chain SDValue LChain = L->getChain(); @@ -8335,7 +8303,7 @@ unsigned LoadWidthInBit = L->getMemoryVT().getScalarSizeInBits(); if (LoadWidthInBit % 8 != 0) return SDValue(); - unsigned ByteOffsetFromVector = P->VectorOffset * LoadWidthInBit / 8; + unsigned ByteOffsetFromVector = P->SrcOffset * LoadWidthInBit / 8; Ptr.addToOffset(ByteOffsetFromVector); } @@ -8392,7 +8360,8 @@ // So the combined value can be loaded from the first load address. if (MemoryByteOffset(*FirstByteProvider) != 0) return SDValue(); - LoadSDNode *FirstLoad = FirstByteProvider->Load; + LoadSDNode *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src); + assert(FirstLoad); // The node we are looking at matches with the pattern, check if we can // replace it with a single (possibly zero-extended) load and bswap + shift if diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -35,6 +36,7 @@ #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" +#include <optional> using namespace llvm; @@ -9534,7 +9536,7 @@ // value 0-3 selects corresponding source byte; // value 0xc selects zero; // value 0xff selects 0xff.
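The selector values described in the comment above can be sanity-checked with a tiny per-byte model. This is an illustrative sketch only (selectByte and the test values are made up here); it models selection from a single 32-bit source with byte 0 as the least significant byte, and does not try to reproduce the two-operand byte ordering of the actual v_perm_b32 instruction.

#include <cassert>
#include <cstdint>

// One selector byte applied to a single 32-bit source: 0-3 picks that byte of
// Src, 0x0c yields 0x00, and 0xff yields 0xff.
static uint32_t selectByte(uint32_t Src, uint8_t Sel) {
  if (Sel <= 3)
    return (Src >> (8 * Sel)) & 0xff;
  return Sel == 0x0c ? 0x00 : 0xff;
}

int main() {
  // Selectors 0x0c020c00 (byte 0 listed first below) keep bytes 0 and 2 and
  // zero bytes 1 and 3, i.e. they behave like (X & 0x00ff00ff).
  const uint8_t Sel[4] = {0x00, 0x0c, 0x02, 0x0c};
  uint32_t X = 0xAABBCCDD, Out = 0;
  for (unsigned I = 0; I < 4; ++I)
    Out |= selectByte(X, Sel[I]) << (8 * I);
  assert(Out == (X & 0x00ff00ffu));
  return 0;
}

With the identity selectors 0x03020100 used by getPermuteMask below, every byte picks itself, while 0x0c entries force the corresponding byte to zero.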
-static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) { +static uint32_t getPermuteMask(SDValue V) { assert(V.getValueSizeInBits() == 32); if (V.getNumOperands() != 2) @@ -9550,15 +9552,13 @@ default: break; case ISD::AND: - if (uint32_t ConstMask = getConstantPermuteMask(C)) { + if (uint32_t ConstMask = getConstantPermuteMask(C)) return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); - } break; case ISD::OR: - if (uint32_t ConstMask = getConstantPermuteMask(C)) { + if (uint32_t ConstMask = getConstantPermuteMask(C)) return (0x03020100 & ~ConstMask) | ConstMask; - } break; case ISD::SHL: @@ -9716,8 +9716,8 @@ const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { - uint32_t LHSMask = getPermuteMask(DAG, LHS); - uint32_t RHSMask = getPermuteMask(DAG, RHS); + uint32_t LHSMask = getPermuteMask(LHS); + uint32_t RHSMask = getPermuteMask(RHS); if (LHSMask != ~0u && RHSMask != ~0u) { // Canonicalize the expression in an attempt to have fewer unique masks // and therefore fewer registers used to hold the masks. @@ -9732,10 +9732,10 @@ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; // Check of we need to combine values from two sources within a byte. - if (!(LHSUsedLanes & RHSUsedLanes) && + if (!(LHSUsedLanes & RHSUsedLanes) || // If we select high and lower word keep it for SDWA. // TODO: teach SDWA to work with v_perm_b32 and remove the check. - !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { + (LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { // Each byte in each mask is either selector mask 0-3, or has higher // bits set in either of masks, which can be 0xff for 0xff or 0x0c for // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise @@ -9763,6 +9763,246 @@ return SDValue(); } +// A key component of v_perm is a mapping between the byte positions of the src +// operands and the byte positions of the dest. To provide such a mapping, we need: 1. the +// node that provides byte x of the dest of the OR, and 2. the byte of that node +// used to provide byte x. calculateByteProvider finds which node provides +// a certain byte of the dest of the OR, and calculateSrcByte takes that node +// and finds the ultimate src and byte position. For example, the supported +// LoadCombine pattern for vector loads is as follows: +// t1 +// or +// / \ +// t2 t3 +// zext shl +// | | \ +// t4 t5 16 +// or anyext +// / \ | +// t6 t7 t8 +// srl shl or +// / | / \ / \ +// t9 t10 t11 t12 t13 t14 +// trunc* 8 trunc* 8 and and +// | | / | | \ +// t15 t16 t17 t18 t19 t20 +// trunc* 255 srl -256 +// | / \ +// t15 t15 16 +// +// *In this example, the truncs are from i32->i16 +// +// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3 +// respectively. calculateSrcByte would find (given node) -> ultimate src & +// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3. +// After finding the mapping, we can combine the tree into vperm t15, t16, +// 0x05000407. + +// Find the source and byte position from a node. +// \p DestByte is the byte position of the dest of the or that the src +// ultimately provides. \p SrcIndex is the byte of the src that maps to this +// byte of the dest of the or. \p Depth tracks how many recursive iterations we have +// performed.
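A smaller worked example in the same notation (illustrative values only): for t = or (and A, 0xffff), (shl B, 16), calculateByteProvider resolves bytes 0-3 of t to A & 0, A & 1, B & 0 and B & 1. Treating A as the first source (whose selectors are offset by 4, as in the mask assembly further below) and B as the second gives the select constant 0x04050001, which is exactly the v_perm_b32 mask that appears in the updated tests when two i16 loads are combined into one i32. A standalone sketch of that arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  // SrcOffsets of the providers for bytes 0..3 of t, and the per-byte source
  // adjustment: 4 for bytes taken from the first source, 0 for the second.
  const unsigned SrcOffset[4] = {0, 1, 0, 1};
  const unsigned Adjust[4] = {4, 4, 0, 0};
  uint64_t PermMask = 0;
  for (unsigned I = 0; I < 4; ++I)
    PermMask |= (uint64_t)(SrcOffset[I] + Adjust[I]) << (24 - I * 8);
  assert(PermMask == 0x04050001);
  return 0;
}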
+static const std::optional<ByteProvider> calculateSrcByte(const SDValue *Op, + uint64_t DestByte, + uint64_t SrcIndex = 0, + unsigned Depth = 0) { + // We may need to recursively traverse a series of SRLs + if (Depth >= 5) + return std::nullopt; + + switch (Op->getOpcode()) { + case ISD::TRUNCATE: { + if (Op->getOperand(0).getScalarValueSizeInBits() != 32) + return std::nullopt; + return calculateSrcByte(&Op->getOperand(0), DestByte, SrcIndex, Depth + 1); + } + + case ISD::SRL: { + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!ShiftOp) + return std::nullopt; + + uint64_t BitShift = ShiftOp->getZExtValue(); + + if (BitShift % 8 != 0) + return std::nullopt; + + SrcIndex += BitShift / 8; + + return calculateSrcByte(&Op->getOperand(0), DestByte, SrcIndex, Depth + 1); + } + + default: { + if (Op->getScalarValueSizeInBits() != 32) + return std::nullopt; + + return ByteProvider::getSrc(Op->getNode(), DestByte, SrcIndex); + } + } + llvm_unreachable("fully handled switch"); +} + +// For a byte position in the result of an Or, traverse the tree and find the +// node (and the byte of the node) which ultimately provides this {Or, +// BytePosition}. \p Op is the operand we are currently examining. \p Index is +// the byte position of the Op that corresponds with the originally requested +// byte of the Or. \p Depth tracks how many recursive iterations we have +// performed. \p StartingIndex is the originally requested byte of the Or. +static const std::optional<ByteProvider> +calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, + unsigned StartingIndex = 0) { + // Finding the Src tree of the RHS of the or typically requires at least 1 additional + // depth + if (Depth >= 8) + return std::nullopt; + + unsigned BitWidth = Op.getScalarValueSizeInBits(); + if (BitWidth % 8 != 0) + return std::nullopt; + assert(Index < BitWidth / 8 && "invalid index requested"); + + switch (Op.getOpcode()) { + case ISD::OR: { + auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1, + StartingIndex); + auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1, + StartingIndex); + // A well-formed Or will only have nonzero bytes for one operand. + if (LHS && RHS && !LHS->isConstantZero() && !RHS->isConstantZero()) + return std::nullopt; + if (!LHS || LHS->isConstantZero()) + return RHS; + if (!RHS || RHS->isConstantZero()) + return LHS; + return std::nullopt; + } + + case ISD::AND: { + auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!BitMaskOp) + return std::nullopt; + + uint32_t BitMask = BitMaskOp->getZExtValue(); + // Bits we expect for our Index + uint32_t IndexMask = 0xFF << (Index * 8); + + if ((IndexMask & BitMask) != IndexMask) { + // If the result of the and partially provides the byte, then it + // is not well formed + if (IndexMask & BitMask) + return std::nullopt; + return ByteProvider::getConstantZero(); + } + + return calculateSrcByte(&Op->getOperand(0), StartingIndex, Index); + } + + case ISD::SRL: { + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!ShiftOp) + return std::nullopt; + + uint64_t BitShift = ShiftOp->getZExtValue(); + if (BitShift % 8) + return std::nullopt; + + auto BitsProvided = Op.getScalarValueSizeInBits(); + if (BitsProvided % 8 != 0) + return std::nullopt; + + uint64_t BytesProvided = BitsProvided / 8; + uint64_t ByteShift = BitShift / 8; + // The dest of the shift will have good [0 : (BytesProvided - ByteShift)] bytes. + // If the byte we are trying to provide (as tracked by index) falls in this + // range, then the SRL provides the byte.
The byte of interest of the src of + // the SRL is Index + ByteShift. + return BytesProvided - ByteShift > Index + ? calculateSrcByte(&Op->getOperand(0), StartingIndex, + Index + ByteShift) + : ByteProvider::getConstantZero(); + } + + case ISD::SHL: { + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!ShiftOp) + return std::nullopt; + + uint64_t BitShift = ShiftOp->getZExtValue(); + if (BitShift % 8 != 0) + return std::nullopt; + uint64_t ByteShift = BitShift / 8; + + // If we are shifting by an amount greater than (or equal to) + // the index we are trying to provide, then it provides 0s. If not, + // then these bytes are not definitively 0s, and the corresponding byte + // of interest is Index - ByteShift of the src. + return Index < ByteShift + ? ByteProvider::getConstantZero() + : calculateByteProvider(Op->getOperand(0), Index - ByteShift, + Depth + 1, StartingIndex); + } + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: { + SDValue NarrowOp = Op->getOperand(0); + unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + if (Index >= NarrowByteWidth) + return Op.getOpcode() == ISD::ZERO_EXTEND + ? std::optional<ByteProvider>(ByteProvider::getConstantZero()) + : std::nullopt; + return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex); + } + + case ISD::TRUNCATE: { + unsigned NarrowBitWidth = Op.getScalarValueSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + if (NarrowByteWidth >= Index) { + return calculateByteProvider(Op->getOperand(0), Index, Depth + 1, + StartingIndex); + } + + return std::nullopt; + } + + case ISD::LOAD: { + auto L = cast<LoadSDNode>(Op.getNode()); + unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + // If the width of the load does not reach the byte we are trying to provide for + // and it is not a ZEXTLOAD, then the load does not provide for the byte in + // question + if (Index >= NarrowByteWidth) + return L->getExtensionType() == ISD::ZEXTLOAD + ? std::optional<ByteProvider>(ByteProvider::getConstantZero()) + : std::nullopt; + + if (NarrowByteWidth > Index) { + return calculateSrcByte(const_cast<SDValue *>(&Op), StartingIndex, + Index); + } + + return std::nullopt; + } + + default: { + return std::nullopt; + } + } + + llvm_unreachable("fully handled switch"); +} + SDValue SITargetLowering::performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -9813,8 +10053,36 @@ const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { - uint32_t LHSMask = getPermuteMask(DAG, LHS); - uint32_t RHSMask = getPermuteMask(DAG, RHS); + + // If the users of the or include a BytewiseOp, then the result of a combine will + // just be extracted again. We should simply not combine.
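The byte-index arithmetic used by the SRL and SHL cases above is easy to check in isolation. A standalone sketch (all names illustrative): byte i of a right shift by 8*S comes from byte i+S of the source while i+S is still in range, and byte i of a left shift by 8*S comes from byte i-S once i >= S; otherwise the byte is zero.

#include <cassert>
#include <cstdint>

// Returns byte I (I = 0 is the LSB) of a 32-bit value.
static uint8_t byteOf(uint32_t V, unsigned I) { return (V >> (8 * I)) & 0xff; }

int main() {
  uint32_t X = 0x44332211;
  for (unsigned S = 0; S < 4; ++S)
    for (unsigned I = 0; I < 4; ++I) {
      // srl: result byte I is source byte I + S, or zero once it runs off the end.
      assert(byteOf(X >> (8 * S), I) == (I + S < 4 ? byteOf(X, I + S) : 0));
      // shl: result byte I is source byte I - S, or zero for the low S bytes.
      assert(byteOf(X << (8 * S), I) == (I >= S ? byteOf(X, I - S) : 0));
    }
  return 0;
}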
+ SmallVector<unsigned> BytewiseOps = {ISD::SINT_TO_FP, ISD::UINT_TO_FP}; + + bool IsCombineExtracted = false; + for (auto OrUse : N->uses()) { + // Only special-case bitcasts to vectors + if (OrUse->getOpcode() != ISD::BITCAST || !OrUse->getValueType(0).isVector()) { + continue; + } + + if (OrUse->hasOneUse()) + if (OrUse->use_begin()->getOpcode() == ISD::ZERO_EXTEND) + OrUse = *OrUse->use_begin(); + + for (auto VUse : OrUse->uses()) { + for (auto BytewiseOp : BytewiseOps) + if (VUse->getOpcode() == BytewiseOp) { + IsCombineExtracted = true; + break; + } + } + } + + if (IsCombineExtracted) return SDValue(); + + uint32_t LHSMask = getPermuteMask(LHS); + uint32_t RHSMask = getPermuteMask(RHS); + if (LHSMask != ~0u && RHSMask != ~0u) { // Canonicalize the expression in an attempt to have fewer unique masks // and therefore fewer registers used to hold the masks. @@ -9841,12 +10109,65 @@ // Combine masks uint32_t Sel = LHSMask | RHSMask; SDLoc DL(N); - return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), RHS.getOperand(0), DAG.getConstant(Sel, DL, MVT::i32)); } } + if (LHSMask == ~0u || RHSMask == ~0u) { + SmallVector<ByteProvider> PermNodes; + + // VT is known to be MVT::i32, so we need to provide 4 bytes. + assert(VT == MVT::i32); + for (int i = 0; i < 4; i++) { + // Find the ByteProvider that provides the ith byte of the result of the OR + std::optional<ByteProvider> P = + calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex*/ i); + // TODO: support constantZero + if (!P.has_value() || P->isConstantZero()) + return SDValue(); + + PermNodes.push_back(*P); + } + if (PermNodes.size() != 4) + return SDValue(); + + int FirstSrc = 0; + int SecondSrc = -1; + uint64_t permMask = 0x00000000; + for (size_t i = 0; i < PermNodes.size(); i++) { + auto PermOp = PermNodes[i]; + // Since the mask is applied to Src1:Src2, Src1 bytes must be offset + // by sizeof(Src2) = 4 + int SrcByteAdjust = 4; + + if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { + if (SecondSrc != -1) { + if (!PermOp.hasSameSrc(PermNodes[SecondSrc])) { + return SDValue(); + } + } + // Set the index of the second distinct Src node + SecondSrc = i; + assert(PermNodes[SecondSrc].Src->getValueType(0).getSizeInBits() == + 32); + SrcByteAdjust = 0; + } + assert(PermOp.SrcOffset + SrcByteAdjust < 8); + // 0th PermNode is MSB in PermMask + permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (24 - (i * 8)); + } + + SDLoc DL(N); + + return DAG.getNode( + AMDGPUISD::PERM, DL, MVT::i32, + SDValue(const_cast<SDNode *>(PermNodes[FirstSrc].Src), 0), + SecondSrc == -1 + ?
DAG.getConstant(0, DL, MVT::i32) + : SDValue(const_cast(PermNodes[SecondSrc].Src), 0), + DAG.getConstant(permMask, DL, MVT::i32)); + } } if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll --- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll @@ -38,20 +38,15 @@ ; GCN-LABEL: vectorLoadShuffle: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v3, 0x4060507 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: s_mov_b32 s0, 0x6050400 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_bfe_u32 v3, v2, 16, 8 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v2 -; GCN-NEXT: v_perm_b32 v3, v3, v2, s0 -; GCN-NEXT: v_and_b32_e32 v4, 0xff0000, v4 -; GCN-NEXT: v_and_b32_e32 v2, 0xff000000, v2 -; GCN-NEXT: v_or3_b32 v2, v3, v4, v2 +; GCN-NEXT: v_perm_b32 v2, v2, 0, v3 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm entry: @@ -96,8 +91,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v2, v[0:1], off ; GCN-NEXT: global_load_ushort v3, v[0:1], off offset:4 +; GCN-NEXT: s_mov_b32 s4, 0x4050001 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GCN-NEXT: v_perm_b32 v0, v2, v3, s4 ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 2 %p.0 = load i16, i16 addrspace(1)* %p, align 4 @@ -238,9 +234,9 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v2, v[0:1], off ; GCN-NEXT: global_load_dword v3, v[0:1], off offset:4 -; GCN-NEXT: s_mov_b32 s4, 0xffff0000 +; GCN-NEXT: s_mov_b32 s4, 0x4050203 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_or_b32 v0, v3, s4, v2 +; GCN-NEXT: v_perm_b32 v0, v2, v3, s4 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -2661,10 +2661,11 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x4050607 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; VI-NEXT: v_perm_b32 v0, v0, 0, v1 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -2678,7 +2679,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX10-NEXT: v_perm_b32 v0, v0, 0, 0x4050607 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-NEXT: global_store_dword v2, v0, s[2:3] @@ -2688,11 +2689,12 @@ ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4050607 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v0, 
0x80000001, v0 +; GFX9-NEXT: v_perm_b32 v0, v0, 0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] @@ -2706,7 +2708,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX11-NEXT: v_perm_b32 v0, v0, 0, 0x4050607 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -564,6 +564,7 @@ ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-ALIGNED-NEXT: s_mov_b32 s0, 0x4050001 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 @@ -574,16 +575,15 @@ ; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:33 ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:34 ; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:35 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4 +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 +; GFX9-ALIGNED-NEXT: v_perm_b32 v2, v2, v4, s0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-ALIGNED-NEXT: v_perm_b32 v1, v3, v1, s0 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm @@ -657,6 +657,7 @@ ; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x8 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-ALIGNED-NEXT: s_mov_b32 s0, 0x4050001 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_add_u32_e32 v1, s4, v0 ; GFX9-ALIGNED-NEXT: ds_read_u8 v2, v1 offset:5 @@ -667,16 +668,15 @@ ; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v1 offset:10 ; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v1 offset:11 ; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v1 offset:12 -; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 8, v4 +; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 8, v2 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 +; GFX9-ALIGNED-NEXT: v_perm_b32 v2, v2, v4, s0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 8, v8 -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX9-ALIGNED-NEXT: v_perm_b32 v1, v3, v1, s0 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-ALIGNED-NEXT: s_endpgm @@ -746,10 +746,11 @@ ; GFX9-ALIGNED-NEXT: ds_read_u16 v3, v1 offset:2 ; GFX9-ALIGNED-NEXT: ds_read_u16 v4, v1 offset:32 ; GFX9-ALIGNED-NEXT: ds_read_u16 v1, v1 
offset:34 +; GFX9-ALIGNED-NEXT: s_mov_b32 s2, 0x4050001 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX9-ALIGNED-NEXT: v_perm_b32 v2, v2, v3, s2 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_lshl_or_b32 v1, v1, 16, v4 +; GFX9-ALIGNED-NEXT: v_perm_b32 v1, v4, v1, s2 ; GFX9-ALIGNED-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX9-ALIGNED-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -38,8 +38,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:2 +; GFX9-NEXT: s_mov_b32 s4, 0x4050001 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX9-NEXT: v_perm_b32 v0, v2, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_load_2xi16_align2: @@ -50,7 +51,7 @@ ; GFX10-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-NEXT: global_load_ushort v3, v[0:1], off offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX10-NEXT: v_perm_b32 v0, v2, v3, 0x4050001 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: global_load_2xi16_align2: @@ -61,7 +62,7 @@ ; GFX11-NEXT: global_load_u16 v2, v[0:1], off ; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x4050001 ; GFX11-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -39,8 +39,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen ; GFX9-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 +; GFX9-NEXT: s_mov_b32 s4, 0x4050001 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v1, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2: @@ -48,8 +49,9 @@ ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off ; GFX9-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2 +; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0x4050001 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX9-FLASTSCR-NEXT: v_perm_b32 v0, v1, v2, s0 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: private_load_2xi16_align2: @@ -60,7 +62,7 @@ ; GFX10-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen ; GFX10-NEXT: buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x4050001 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align2: @@ -71,7 +73,7 @@ ; GFX10-FLASTSCR-NEXT: scratch_load_ushort v1, v0, off ; GFX10-FLASTSCR-NEXT: scratch_load_ushort v2, v0, off offset:2 ; GFX10-FLASTSCR-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX10-FLASTSCR-NEXT: v_perm_b32 v0, v1, v2, 0x4050001 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_load_2xi16_align2: @@ -82,7 +84,7 @@ ; GFX11-NEXT: scratch_load_u16 v1, v0, off ; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x4050001 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2: @@ -93,7 +95,7 @@ ; GFX11-FLASTSCR-NEXT: scratch_load_u16 v1, v0, off ; GFX11-FLASTSCR-NEXT: scratch_load_u16 v0, v0, off offset:2 ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FLASTSCR-NEXT: v_perm_b32 v0, v1, v0, 0x4050001 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 @@ -235,20 +237,18 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4050607 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, 0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align1: ; GFX9-FLASTSCR: ; %bb.0: ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLASTSCR-NEXT: scratch_load_dword v0, v0, off -; GFX9-FLASTSCR-NEXT: s_mov_b32 s0, 0xffff +; GFX9-FLASTSCR-NEXT: v_mov_b32_e32 v1, 0x4050607 ; GFX9-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX9-FLASTSCR-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX9-FLASTSCR-NEXT: v_perm_b32 v0, v0, 0, v1 ; GFX9-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: private_load_2xi16_align1: @@ -257,8 +257,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, 0, 0x4050607 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align1: @@ -267,8 +266,7 @@ ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: scratch_load_dword v0, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-FLASTSCR-NEXT: v_perm_b32 v0, v0, 0, 0x4050607 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: private_load_2xi16_align1: @@ -277,9 +275,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_perm_b32 v0, v0, 0, 0x4050607 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align1: @@ -288,9 +284,7 @@ ; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off ; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLASTSCR-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-FLASTSCR-NEXT: v_perm_b32 v0, v0, 0, 
0x4050607 ; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -755,10 +755,10 @@ ; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x6070203 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-NEXT: v_alignbit_b32 v2, v2, s4, 16 +; VI-NEXT: v_perm_b32 v2, s4, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1594,6 +1594,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x4050203 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1601,11 +1602,9 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v0, s0, v4, v0 +; VI-NEXT: v_perm_b32 v0, s4, v0, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1751,6 +1750,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x4050203 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1758,11 +1758,9 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 +; VI-NEXT: v_perm_b32 v1, s4, v1, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1908,6 +1906,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x4050203 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 @@ -1915,11 +1914,9 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_mov_b32 s0, 0xffff -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 +; VI-NEXT: v_perm_b32 v1, s4, v1, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -2689,6 +2686,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; VI-NEXT: v_mov_b32_e32 v12, 0x4050203 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8 @@ -2701,11 +2699,9 @@ ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8 -; VI-NEXT: s_mov_b32 s2, 0xffff -; VI-NEXT: v_mov_b32_e32 v12, s4 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; VI-NEXT: s_waitcnt 
vmcnt(1) -; VI-NEXT: v_bfi_b32 v3, s2, v12, v3 +; VI-NEXT: v_perm_b32 v3, s4, v3, v12 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -2258,9 +2258,9 @@ ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v1, v0 ; GFX803-NEXT: ds_read_u16 v0, v0 offset:2 +; GFX803-NEXT: s_mov_b32 s4, 0x4050001 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_local_v2i16_split_multi_chain: @@ -2306,12 +2306,11 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 -; GFX803-NEXT: ds_read_u16 v1, v0 offset:16 -; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: ds_read_u16 v1, v0 +; GFX803-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX803-NEXT: s_mov_b32 s4, 0x4050001 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_samechain: @@ -2411,9 +2410,9 @@ ; GFX803-NEXT: ds_read_u16 v2, v0 ; GFX803-NEXT: ds_write_b16 v1, v3 ; GFX803-NEXT: ds_read_u16 v0, v0 offset:16 +; GFX803-NEXT: s_mov_b32 s4, 0x4050001 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX803-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_local_lo_hi_v2i16_side_effect: @@ -2469,8 +2468,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x4050001 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_global_v2i16_split: @@ -2523,9 +2522,10 @@ ; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: s_mov_b32 s4, 0x4050001 +; GFX803-NEXT: s_waitcnt lgkmcnt(0) +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_flat_v2i16_split: @@ -2575,9 +2575,9 @@ ; GFX803-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] glc ; GFX803-NEXT: flat_load_ushort v1, v[2:3] glc +; GFX803-NEXT: s_mov_b32 s4, 0x4050001 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_constant_v2i16_split: @@ -2628,8 +2628,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:2 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x4050001 +; 
GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-FLATSCR-LABEL: load_private_v2i16_split: diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -221,9 +221,9 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -682,12 +682,12 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: ds_write_b16 v3, v2 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -794,9 +794,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -836,9 +836,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1050,9 +1050,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1089,9 +1089,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1292,9 +1292,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: 
s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1394,9 +1394,9 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1444,8 +1444,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1493,8 +1493,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1542,8 +1542,8 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1843,9 +1843,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1885,9 +1885,9 @@ ; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0 ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX803-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -2026,8 +2026,8 @@ ; GFX803-NEXT: v_mov_b32_e32 v2, 44 ; GFX803-NEXT: buffer_load_ushort v1, v2, s[0:3], s32 offen offset:4054 glc ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX803-NEXT: s_mov_b32 s4, 0x4050203 +; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git 
a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -71,26 +71,23 @@ ; GFX9-NEXT: ds_read_u8 v14, v0 offset:13 ; GFX9-NEXT: ds_read_u8 v15, v0 offset:14 ; GFX9-NEXT: ds_read_u8 v16, v0 offset:15 -; GFX9-NEXT: s_waitcnt lgkmcnt(14) -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 8, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(12) -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 8, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(10) -; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 8, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x4050001 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(8) -; GFX9-NEXT: v_lshl_or_b32 v2, v8, 8, v7 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(6) -; GFX9-NEXT: v_lshl_or_b32 v2, v10, 8, v9 +; GFX9-NEXT: v_lshl_or_b32 v1, v8, 8, v7 +; GFX9-NEXT: v_lshl_or_b32 v2, v6, 8, v5 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(4) -; GFX9-NEXT: v_lshl_or_b32 v3, v12, 8, v11 -; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-NEXT: v_lshl_or_b32 v3, v14, 8, v13 +; GFX9-NEXT: v_lshl_or_b32 v2, v12, 8, v11 +; GFX9-NEXT: v_lshl_or_b32 v3, v10, 8, v9 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v4, v16, 8, v15 -; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v16, 8, v15 +; GFX9-NEXT: v_lshl_or_b32 v4, v14, 8, v13 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align1: @@ -237,30 +234,30 @@ ; GFX10-NEXT: ds_read_u8 v10, v0 offset:9 ; GFX10-NEXT: ds_read_u8 v11, v0 offset:10 ; GFX10-NEXT: ds_read_u8 v12, v0 offset:11 -; GFX10-NEXT: ds_read_u8 v13, v0 offset:12 -; GFX10-NEXT: ds_read_u8 v14, v0 offset:13 -; GFX10-NEXT: ds_read_u8 v15, v0 offset:14 -; GFX10-NEXT: ds_read_u8 v0, v0 offset:15 +; GFX10-NEXT: ds_read_u8 v13, v0 offset:14 +; GFX10-NEXT: ds_read_u8 v14, v0 offset:15 +; GFX10-NEXT: ds_read_u8 v15, v0 offset:12 +; GFX10-NEXT: ds_read_u8 v0, v0 offset:13 ; GFX10-NEXT: s_waitcnt lgkmcnt(14) ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(12) -; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 8, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(10) -; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5 ; GFX10-NEXT: s_waitcnt lgkmcnt(8) -; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX10-NEXT: v_lshl_or_b32 v2, v8, 8, v7 ; GFX10-NEXT: s_waitcnt lgkmcnt(6) -; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9 +; GFX10-NEXT: v_lshl_or_b32 v6, v10, 8, v9 ; GFX10-NEXT: s_waitcnt lgkmcnt(4) -; GFX10-NEXT: v_lshl_or_b32 v6, v12, 8, v11 +; GFX10-NEXT: v_lshl_or_b32 v5, v12, 8, v11 ; GFX10-NEXT: s_waitcnt lgkmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v7, v14, 8, v13 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v8, v0, 8, v15 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX10-NEXT: v_perm_b32 v0, v1, v3, 0x4050001 +; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x4050001 +; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x4050001 +; GFX10-NEXT: v_perm_b32 v3, v8, v7, 0x4050001 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: load_lds_v4i32_align1: @@ -279,31 +276,31 @@ 
; GFX11-NEXT: ds_load_u8 v10, v0 offset:9
; GFX11-NEXT: ds_load_u8 v11, v0 offset:10
; GFX11-NEXT: ds_load_u8 v12, v0 offset:11
-; GFX11-NEXT: ds_load_u8 v13, v0 offset:12
-; GFX11-NEXT: ds_load_u8 v14, v0 offset:13
-; GFX11-NEXT: ds_load_u8 v15, v0 offset:14
-; GFX11-NEXT: ds_load_u8 v0, v0 offset:15
+; GFX11-NEXT: ds_load_u8 v13, v0 offset:14
+; GFX11-NEXT: ds_load_u8 v14, v0 offset:15
+; GFX11-NEXT: ds_load_u8 v15, v0 offset:12
+; GFX11-NEXT: ds_load_u8 v0, v0 offset:13
; GFX11-NEXT: s_waitcnt lgkmcnt(14)
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(12)
-; GFX11-NEXT: v_lshl_or_b32 v2, v4, 8, v3
+; GFX11-NEXT: v_lshl_or_b32 v3, v4, 8, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(10)
-; GFX11-NEXT: v_lshl_or_b32 v3, v6, 8, v5
+; GFX11-NEXT: v_lshl_or_b32 v4, v6, 8, v5
; GFX11-NEXT: s_waitcnt lgkmcnt(8)
-; GFX11-NEXT: v_lshl_or_b32 v4, v8, 8, v7
+; GFX11-NEXT: v_lshl_or_b32 v2, v8, 8, v7
; GFX11-NEXT: s_waitcnt lgkmcnt(6)
-; GFX11-NEXT: v_lshl_or_b32 v5, v10, 8, v9
+; GFX11-NEXT: v_lshl_or_b32 v6, v10, 8, v9
; GFX11-NEXT: s_waitcnt lgkmcnt(4)
-; GFX11-NEXT: v_lshl_or_b32 v6, v12, 8, v11
+; GFX11-NEXT: v_lshl_or_b32 v5, v12, 8, v11
; GFX11-NEXT: s_waitcnt lgkmcnt(2)
; GFX11-NEXT: v_lshl_or_b32 v7, v14, 8, v13
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_lshl_or_b32 v8, v0, 8, v15
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; GFX11-NEXT: v_perm_b32 v0, v1, v3, 0x4050001
+; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x4050001
+; GFX11-NEXT: v_perm_b32 v2, v6, v5, 0x4050001
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v7
+; GFX11-NEXT: v_perm_b32 v3, v8, v7, 0x4050001
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
ret <4 x i32> %load
@@ -321,14 +318,15 @@
; GFX9-NEXT: ds_read_u16 v6, v0 offset:10
; GFX9-NEXT: ds_read_u16 v7, v0 offset:12
; GFX9-NEXT: ds_read_u16 v8, v0 offset:14
+; GFX9-NEXT: s_mov_b32 s4, 0x4050001
; GFX9-NEXT: s_waitcnt lgkmcnt(6)
-; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1
+; GFX9-NEXT: v_perm_b32 v0, v1, v2, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
-; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; GFX9-NEXT: v_perm_b32 v1, v3, v4, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; GFX9-NEXT: v_perm_b32 v2, v5, v6, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v3, v8, 16, v7
+; GFX9-NEXT: v_perm_b32 v3, v7, v8, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align2:
@@ -407,13 +405,13 @@
; GFX10-NEXT: ds_read_u16 v7, v0 offset:12
; GFX10-NEXT: ds_read_u16 v8, v0 offset:14
; GFX10-NEXT: s_waitcnt lgkmcnt(6)
-; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
+; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x4050001
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; GFX10-NEXT: v_perm_b32 v1, v3, v4, 0x4050001
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; GFX10-NEXT: v_perm_b32 v2, v5, v6, 0x4050001
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7
+; GFX10-NEXT: v_perm_b32 v3, v7, v8, 0x4050001
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align2:
@@ -429,13 +427,13 @@
; GFX11-NEXT: ds_load_u16 v7, v0 offset:12
; GFX11-NEXT: ds_load_u16 v8, v0 offset:14
; GFX11-NEXT: s_waitcnt lgkmcnt(6)
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1
+; GFX11-NEXT: v_perm_b32 v0, v1, v2, 0x4050001
; GFX11-NEXT: s_waitcnt lgkmcnt(4)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; GFX11-NEXT: v_perm_b32 v1, v3, v4, 0x4050001
; GFX11-NEXT: s_waitcnt lgkmcnt(2)
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; GFX11-NEXT: v_perm_b32 v2, v5, v6, 0x4050001
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v7
+; GFX11-NEXT: v_perm_b32 v3, v7, v8, 0x4050001
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
ret <4 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -67,21 +67,19 @@
; GFX9-NEXT: ds_read_u8 v10, v0 offset:9
; GFX9-NEXT: ds_read_u8 v11, v0 offset:10
; GFX9-NEXT: ds_read_u8 v12, v0 offset:11
-; GFX9-NEXT: s_waitcnt lgkmcnt(10)
-; GFX9-NEXT: v_lshl_or_b32 v0, v2, 8, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(8)
-; GFX9-NEXT: v_lshl_or_b32 v1, v4, 8, v3
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(6)
-; GFX9-NEXT: v_lshl_or_b32 v1, v6, 8, v5
+; GFX9-NEXT: v_lshl_or_b32 v0, v4, 8, v3
+; GFX9-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; GFX9-NEXT: s_mov_b32 s4, 0x4050001
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
-; GFX9-NEXT: v_lshl_or_b32 v2, v8, 8, v7
-; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-NEXT: v_lshl_or_b32 v2, v10, 8, v9
+; GFX9-NEXT: v_lshl_or_b32 v1, v8, 8, v7
+; GFX9-NEXT: v_lshl_or_b32 v2, v6, 8, v5
+; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v3, v12, 8, v11
-; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
+; GFX9-NEXT: v_lshl_or_b32 v2, v12, 8, v11
+; GFX9-NEXT: v_lshl_or_b32 v3, v10, 8, v9
+; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align1:
@@ -198,25 +196,25 @@
; GFX10-NEXT: ds_read_u8 v6, v0 offset:5
; GFX10-NEXT: ds_read_u8 v7, v0 offset:6
; GFX10-NEXT: ds_read_u8 v8, v0 offset:7
-; GFX10-NEXT: ds_read_u8 v9, v0 offset:8
-; GFX10-NEXT: ds_read_u8 v10, v0 offset:9
-; GFX10-NEXT: ds_read_u8 v11, v0 offset:10
-; GFX10-NEXT: ds_read_u8 v0, v0 offset:11
+; GFX10-NEXT: ds_read_u8 v9, v0 offset:10
+; GFX10-NEXT: ds_read_u8 v10, v0 offset:11
+; GFX10-NEXT: ds_read_u8 v11, v0 offset:8
+; GFX10-NEXT: ds_read_u8 v0, v0 offset:9
; GFX10-NEXT: s_waitcnt lgkmcnt(10)
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(8)
-; GFX10-NEXT: v_lshl_or_b32 v2, v4, 8, v3
+; GFX10-NEXT: v_lshl_or_b32 v3, v4, 8, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(6)
-; GFX10-NEXT: v_lshl_or_b32 v3, v6, 8, v5
+; GFX10-NEXT: v_lshl_or_b32 v4, v6, 8, v5
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_lshl_or_b32 v4, v8, 8, v7
+; GFX10-NEXT: v_lshl_or_b32 v2, v8, 8, v7
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
; GFX10-NEXT: v_lshl_or_b32 v5, v10, 8, v9
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_lshl_or_b32 v6, v0, 8, v11
-; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
-; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
-; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; GFX10-NEXT: v_perm_b32 v0, v1, v3, 0x4050001
+; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x4050001
+; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x4050001
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v3i32_align1:
@@ -231,26 +229,26 @@
; GFX11-NEXT: ds_load_u8 v6, v0 offset:5
; GFX11-NEXT: ds_load_u8 v7, v0 offset:6
; GFX11-NEXT: ds_load_u8 v8, v0 offset:7
-; GFX11-NEXT: ds_load_u8 v9, v0 offset:8
-; GFX11-NEXT: ds_load_u8 v10, v0 offset:9
-; GFX11-NEXT: ds_load_u8 v11, v0 offset:10
-; GFX11-NEXT: ds_load_u8 v0, v0 offset:11
+; GFX11-NEXT: ds_load_u8 v9, v0 offset:10
+; GFX11-NEXT: ds_load_u8 v10, v0 offset:11
+; GFX11-NEXT: ds_load_u8 v11, v0 offset:8
+; GFX11-NEXT: ds_load_u8 v0, v0 offset:9
; GFX11-NEXT: s_waitcnt lgkmcnt(10)
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(8)
-; GFX11-NEXT: v_lshl_or_b32 v2, v4, 8, v3
+; GFX11-NEXT: v_lshl_or_b32 v3, v4, 8, v3
; GFX11-NEXT: s_waitcnt lgkmcnt(6)
-; GFX11-NEXT: v_lshl_or_b32 v3, v6, 8, v5
+; GFX11-NEXT: v_lshl_or_b32 v4, v6, 8, v5
; GFX11-NEXT: s_waitcnt lgkmcnt(4)
-; GFX11-NEXT: v_lshl_or_b32 v4, v8, 8, v7
+; GFX11-NEXT: v_lshl_or_b32 v2, v8, 8, v7
; GFX11-NEXT: s_waitcnt lgkmcnt(2)
; GFX11-NEXT: v_lshl_or_b32 v5, v10, 8, v9
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_lshl_or_b32 v6, v0, 8, v11
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; GFX11-NEXT: v_perm_b32 v0, v1, v3, 0x4050001
+; GFX11-NEXT: v_perm_b32 v1, v4, v2, 0x4050001
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; GFX11-NEXT: v_perm_b32 v2, v6, v5, 0x4050001
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
ret <3 x i32> %load
@@ -266,12 +264,13 @@
; GFX9-NEXT: ds_read_u16 v4, v0 offset:6
; GFX9-NEXT: ds_read_u16 v5, v0 offset:8
; GFX9-NEXT: ds_read_u16 v6, v0 offset:10
+; GFX9-NEXT: s_mov_b32 s4, 0x4050001
; GFX9-NEXT: s_waitcnt lgkmcnt(4)
-; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v1
+; GFX9-NEXT: v_perm_b32 v0, v1, v2, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; GFX9-NEXT: v_perm_b32 v1, v3, v4, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; GFX9-NEXT: v_perm_b32 v2, v5, v6, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align2:
@@ -335,11 +334,11 @@
; GFX10-NEXT: ds_read_u16 v5, v0 offset:8
; GFX10-NEXT: ds_read_u16 v6, v0 offset:10
; GFX10-NEXT: s_waitcnt lgkmcnt(4)
-; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1
+; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x4050001
; GFX10-NEXT: s_waitcnt lgkmcnt(2)
-; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; GFX10-NEXT: v_perm_b32 v1, v3, v4, 0x4050001
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; GFX10-NEXT: v_perm_b32 v2, v5, v6, 0x4050001
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v3i32_align2:
@@ -353,11 +352,11 @@
; GFX11-NEXT: ds_load_u16 v5, v0 offset:8
; GFX11-NEXT: ds_load_u16 v6, v0 offset:10
; GFX11-NEXT: s_waitcnt lgkmcnt(4)
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1
+; GFX11-NEXT: v_perm_b32 v0, v1, v2, 0x4050001
; GFX11-NEXT: s_waitcnt lgkmcnt(2)
-; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3
+; GFX11-NEXT: v_perm_b32 v1, v3, v4, 0x4050001
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5
+; GFX11-NEXT: v_perm_b32 v2, v5, v6, 0x4050001
; GFX11-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
ret <3 x i32> %load
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -191,8 +191,8 @@
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_mov_b32 s0, 0x4050001
+; GFX8-NEXT: v_perm_b32 v0, v0, v1, s0
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; use v0
; GFX8-NEXT: ;;#ASMEND
@@ -271,10 +271,10 @@
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_mov_b32 s0, 0x4050001
; GFX8-NEXT: s_mov_b32 s3, 0x1100f000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_perm_b32 v0, v0, v1, s0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 9, v0
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -187,8 +187,8 @@
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: flat_load_dword v1, v[2:3] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: s_mov_b32 s0, 0x4050001
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s0
; GFX803-NEXT: ;;#ASMSTART
; GFX803-NEXT: ; use v0
; GFX803-NEXT: ;;#ASMEND
@@ -265,10 +265,10 @@
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: flat_load_dword v1, v[2:3] glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
+; GFX803-NEXT: s_mov_b32 s0, 0x4050001
; GFX803-NEXT: s_mov_b32 s3, 0x1100f000
; GFX803-NEXT: s_mov_b32 s2, -1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_perm_b32 v0, v0, v1, s0
; GFX803-NEXT: v_add_u32_e32 v0, vcc, 9, v0
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX803-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/permute.ll b/llvm/test/CodeGen/AMDGPU/permute.ll
--- a/llvm/test/CodeGen/AMDGPU/permute.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute.ll
@@ -256,14 +256,16 @@
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_mov_b32_e32 v3, 0x7020104
+; GCN-NEXT: v_mov_b32_e32 v3, 0x4050607
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT: flat_load_dword v2, v[0:1]
+; GCN-NEXT: s_or_b32 s0, s0, 0xff0000ff
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_perm_b32 v2, v2, s0, v3
+; GCN-NEXT: v_perm_b32 v2, v2, 0, v3
+; GCN-NEXT: v_and_b32_e32 v2, s0, v2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
bb:
@@ -284,7 +286,8 @@
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_mov_b32_e32 v5, 0xffff8004
+; GCN-NEXT: v_mov_b32_e32 v5, 0xc050c07
+; GCN-NEXT: v_mov_b32_e32 v6, 0xffff8004
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -296,11 +299,10 @@
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v4, 4, v4
-; GCN-NEXT: v_and_b32_e32 v4, 0xff00ff, v4
+; GCN-NEXT: v_perm_b32 v4, v4, 0, v5
; GCN-NEXT: v_or_b32_e32 v4, s0, v4
; GCN-NEXT: flat_store_dword v[0:1], v4
-; GCN-NEXT: flat_store_dword v[2:3], v5
+; GCN-NEXT: flat_store_dword v[2:3], v6
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -360,8 +362,8 @@
; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT: v_mov_b32_e32 v5, 0xffff0500
-; GCN-NEXT: v_mov_b32_e32 v6, 0xffff8004
+; GCN-NEXT: v_mov_b32_e32 v5, 0x4050607
+; GCN-NEXT: v_mov_b32_e32 v6, 0xffff0500
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -369,12 +371,13 @@
; GCN-NEXT: flat_load_dword v4, v[0:1]
; GCN-NEXT: s_or_b32 s0, s0, 4
; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v7, 0xffff8004
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v4, 0x8000, v4
-; GCN-NEXT: v_perm_b32 v4, v4, s0, v5
+; GCN-NEXT: v_perm_b32 v4, v4, 0, v5
+; GCN-NEXT: v_perm_b32 v4, v4, s0, v6
; GCN-NEXT: flat_store_dword v[0:1], v4
-; GCN-NEXT: flat_store_dword v[2:3], v6
+; GCN-NEXT: flat_store_dword v[2:3], v7
; GCN-NEXT: s_endpgm
bb:
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -0,0 +1,316 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
+
+define hidden void @shuffle6766(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle6766:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v0, v[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, 0, 0x6070606
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle6766:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v0, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x6070606
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, 0, v1
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 6, i32 6>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+
+define hidden void @shuffle3746(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle3746:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v6, v[0:1]
+; GFX10-NEXT: flat_load_dword v7, v[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x7030000
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle3746:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v6, v[0:1]
+; GFX9-NEXT: flat_load_dword v7, v[2:3]
+; GFX9-NEXT: s_mov_b32 s4, 0x7030000
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+
+define hidden void @shuffle4445(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle4445:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v0, v[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, 0, 0x4040405
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle4445:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v0, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x4040405
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, 0, v1
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 5>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+
+define hidden void @shuffle0101(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle0101:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v0, v[0:1]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, 0, 0x4050405
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle0101:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v0, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x4050405
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, 0, v1
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+
+define hidden void @shuffle7533(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle7533:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v6, v[0:1]
+; GFX10-NEXT: flat_load_dword v7, v[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x7050303
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle7533:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v6, v[0:1]
+; GFX9-NEXT: flat_load_dword v7, v[2:3]
+; GFX9-NEXT: s_mov_b32 s4, 0x7050303
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+
+define hidden void @shuffle7767(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle7767:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v0, v[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v0, 0, 0x7070607
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle7767:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v0, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x7070607
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v0, 0, v1
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+
+define hidden void @shuffle0554(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle0554:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v6, v[0:1]
+; GFX10-NEXT: flat_load_dword v7, v[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x4010100
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle0554:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v6, v[0:1]
+; GFX9-NEXT: flat_load_dword v7, v[2:3]
+; GFX9-NEXT: s_mov_b32 s4, 0x4010100
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 4>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+
+define hidden void @shuffle2127(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle2127:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v6, v[0:1]
+; GFX10-NEXT: flat_load_dword v7, v[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x6050603
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle2127:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v6, v[0:1]
+; GFX9-NEXT: flat_load_dword v7, v[2:3]
+; GFX9-NEXT: s_mov_b32 s4, 0x6050603
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 1, i32 2, i32 7>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+
+define hidden void @shuffle5047(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle5047:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v6, v[0:1]
+; GFX10-NEXT: flat_load_dword v7, v[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x5000407
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle5047:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v6, v[0:1]
+; GFX9-NEXT: flat_load_dword v7, v[2:3]
+; GFX9-NEXT: s_mov_b32 s4, 0x5000407
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 0, i32 4, i32 7>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+
+define hidden void @shuffle3546(<4 x i8>* %in0, <4 x i8>* %in1, <4 x i8>* %out0) {
+; GFX10-LABEL: shuffle3546:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_load_dword v6, v[0:1]
+; GFX10-NEXT: flat_load_dword v7, v[2:3]
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x7010002
+; GFX10-NEXT: flat_store_dword v[4:5], v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: shuffle3546:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_load_dword v6, v[0:1]
+; GFX9-NEXT: flat_load_dword v7, v[2:3]
+; GFX9-NEXT: s_mov_b32 s4, 0x7010002
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4
+; GFX9-NEXT: flat_store_dword v[4:5], v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %vec0 = load <4 x i8>, <4 x i8>* %in0, align 4
+ %vec1 = load <4 x i8>, <4 x i8>* %in1, align 4
+ %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 5, i32 4, i32 6>
+ store <4 x i8> %shuffle0_0, <4 x i8>* %out0, align 4
+ ret void
+}
+