diff --git a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -49,6 +49,9 @@
   SDValue getBase() const { return Base; }
   SDValue getIndex() { return Index; }
   SDValue getIndex() const { return Index; }
+  void addToOffset(int64_t VectorOff) {
+    Offset = Offset.value_or(0) + VectorOff;
+  }
 
   bool hasValidOffset() const { return Offset.has_value(); }
   int64_t getOffset() const { return *Offset; }
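The new addToOffset hook treats a not-yet-computed offset as zero before folding in a vector element's byte offset. A minimal standalone sketch of that behaviour, assuming plain C++17 and a hypothetical OffsetModel stand-in for the Offset member (illustration only, not LLVM code):

#include <cassert>
#include <cstdint>
#include <optional>

// Hypothetical stand-in for BaseIndexOffset's optional Offset member.
struct OffsetModel {
  std::optional<int64_t> Offset;

  // Mirrors the addToOffset added above: an unknown offset counts as zero.
  void addToOffset(int64_t VectorOff) {
    Offset = Offset.value_or(0) + VectorOff;
  }
};

int main() {
  OffsetModel Unset;
  Unset.addToOffset(3);   // no prior offset: becomes 0 + 3
  assert(Unset.Offset == 3);

  OffsetModel Known{16};
  Known.addToOffset(3);   // an existing base offset is biased by the element offset
  assert(Known.Offset == 19);
  return 0;
}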
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7785,25 +7785,28 @@
   // ByteOffset is the offset of the byte in the value produced by the load.
   LoadSDNode *Load = nullptr;
   unsigned ByteOffset = 0;
+  unsigned VectorOffset = 0;
 
   ByteProvider() = default;
 
-  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
-    return ByteProvider(Load, ByteOffset);
+  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset,
+                                unsigned VectorOffset) {
+    return ByteProvider(Load, ByteOffset, VectorOffset);
   }
 
-  static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
+  static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0, 0); }
 
   bool isConstantZero() const { return !Load; }
   bool isMemory() const { return Load; }
 
   bool operator==(const ByteProvider &Other) const {
-    return Other.Load == Load && Other.ByteOffset == ByteOffset;
+    return Other.Load == Load && Other.ByteOffset == ByteOffset &&
+           Other.VectorOffset == VectorOffset;
   }
 
 private:
-  ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
-      : Load(Load), ByteOffset(ByteOffset) {}
+  ByteProvider(LoadSDNode *Load, unsigned ByteOffset, unsigned VectorOffset)
+      : Load(Load), ByteOffset(ByteOffset), VectorOffset(VectorOffset) {}
 };
 
 } // end anonymous namespace
@@ -7811,25 +7814,63 @@
 /// Recursively traverses the expression calculating the origin of the requested
 /// byte of the given value. Returns None if the provider can't be calculated.
 ///
-/// For all the values except the root of the expression verifies that the value
-/// has exactly one use and if it's not true return None. This way if the origin
-/// of the byte is returned it's guaranteed that the values which contribute to
-/// the byte are not used outside of this expression.
+/// For all the values except the root of the expression, we verify that the
+/// value has exactly one use and return None if it does not. This way, if the
+/// origin of the byte is returned, it is guaranteed that the values which
+/// contribute to the byte are not used outside of this expression.
+
+/// However, there is a special case when dealing with vector loads -- we allow
+/// more than one use if the load is a vector type. Since the values that
+/// contribute to the byte ultimately come from the ExtractVectorElements of the
+/// Load, we don't care if the Load has uses other than ExtractVectorElements,
+/// because those operations are independent from the pattern to be combined.
+/// For vector loads, we simply care that the ByteProviders are adjacent
+/// positions of the same vector, and their index matches the byte that is being
+/// provided. This is captured by the \p VectorIndex algorithm. \p VectorIndex
+/// is the index used in an ExtractVectorElement, and \p StartingIndex is the
+/// byte position we are trying to provide for the LoadCombine. If these do
+/// not match, then we cannot combine the vector loads. \p Index holds the
+/// byte position we are trying to provide and is matched against the shl
+/// amount and the load size. The \p Index checks ensure the requested byte
+/// is provided by the pattern and that no bytes are over-provided.
 ///
-/// Because the parts of the expression are not allowed to have more than one
-/// use this function iterates over trees, not DAGs. So it never visits the same
-/// node more than once.
+///
+/// The supported LoadCombine pattern for vector loads is as follows:
+///
+///                        or
+///                      /    \
+///                    or      shl
+///                   /  \       |
+///                 or    shl   zext
+///                /  \     |     |
+///              shl  zext zext  EVE*
+///               |     |    |     |
+///             zext  EVE* EVE*  LOAD
+///               |     |    |
+///             EVE*  LOAD LOAD
+///               |
+///             LOAD
+///
+/// *ExtractVectorElement
 static const Optional<ByteProvider>
 calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
-                      bool Root = false) {
+                      Optional<uint64_t> VectorIndex,
+                      unsigned StartingIndex = 0) {
+  // A typical i64-by-i8 pattern requires recursion up to a depth of 8 calls.
   if (Depth == 10)
     return None;
 
-  if (!Root && !Op.hasOneUse())
+  // Only allow multiple uses if the instruction is a vector load (in which
+  // case we will use the load for every ExtractVectorElement).
+  if (Depth && !Op.hasOneUse() &&
+      (Op.getOpcode() != ISD::LOAD || !Op.getValueType().isVector()))
+    return None;
+
+  // Fail to combine if we have encountered anything but a LOAD after handling
+  // an ExtractVectorElement.
+  if (Op.getOpcode() != ISD::LOAD && VectorIndex.has_value())
     return None;
 
-  assert(Op.getValueType().isScalarInteger() && "can't handle other types");
   unsigned BitWidth = Op.getValueSizeInBits();
   if (BitWidth % 8 != 0)
     return None;
@@ -7839,10 +7880,12 @@
   switch (Op.getOpcode()) {
   case ISD::OR: {
-    auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
+    auto LHS =
+        calculateByteProvider(Op->getOperand(0), Index, Depth + 1, VectorIndex);
     if (!LHS)
       return None;
-    auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
+    auto RHS =
+        calculateByteProvider(Op->getOperand(1), Index, Depth + 1, VectorIndex);
     if (!RHS)
       return None;
@@ -7858,14 +7901,18 @@
       return None;
 
     uint64_t BitShift = ShiftOp->getZExtValue();
+
     if (BitShift % 8 != 0)
       return None;
     uint64_t ByteShift = BitShift / 8;
 
+    // If we are shifting by an amount greater than the index we are trying to
+    // provide, then do not provide anything. Otherwise, subtract the shift
+    // amount from the index.
     return Index < ByteShift
                ? ByteProvider::getConstantZero()
                : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
-                                       Depth + 1);
+                                       Depth + 1, VectorIndex, Index);
   }
   case ISD::ANY_EXTEND:
   case ISD::SIGN_EXTEND:
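For intuition about the \p Index bookkeeping in the ISD::SHL case above, here is a small self-contained sketch of just that byte arithmetic, with "no value" standing in for the constant-zero provider (an illustrative model, not the SelectionDAG implementation):

#include <cassert>
#include <cstdint>
#include <optional>

// Model of the SHL handling: byte `Index` of (x shl BitShift) comes from byte
// `Index - BitShift / 8` of x, and bytes below the shift amount are known
// zeros (modelled here as "no load byte").
std::optional<unsigned> providerByteAfterShl(unsigned Index, uint64_t BitShift) {
  if (BitShift % 8 != 0)
    return std::nullopt;      // not a whole-byte shift: the combine gives up
  uint64_t ByteShift = BitShift / 8;
  if (Index < ByteShift)
    return std::nullopt;      // requested byte is filled with zeros by the shift
  return Index - ByteShift;   // otherwise recurse with the reduced index
}

int main() {
  assert(providerByteAfterShl(2, 16) == 0u); // byte 2 of (x shl 16) is byte 0 of x
  assert(providerByteAfterShl(3, 8) == 2u);  // byte 3 of (x shl 8)  is byte 2 of x
  assert(!providerByteAfterShl(0, 16));      // byte 0 of (x shl 16) is a zero
  return 0;
}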
@@ -7880,11 +7927,39 @@
       return Op.getOpcode() == ISD::ZERO_EXTEND
                  ? Optional<ByteProvider>(ByteProvider::getConstantZero())
                  : None;
-    return calculateByteProvider(NarrowOp, Index, Depth + 1);
+    return calculateByteProvider(NarrowOp, Index, Depth + 1, VectorIndex,
+                                 StartingIndex);
   }
   case ISD::BSWAP:
     return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
-                                 Depth + 1);
+                                 Depth + 1, VectorIndex, StartingIndex);
+  case ISD::EXTRACT_VECTOR_ELT: {
+    auto OffsetOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!OffsetOp)
+      return None;
+
+    VectorIndex = OffsetOp->getZExtValue();
+
+    SDValue NarrowOp = Op->getOperand(0);
+    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
+      return None;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    // Check to see if the position of the element in the vector corresponds
+    // with the byte we are trying to provide for. In the case of a vector of
+    // i8, this simply means VectorIndex == StartingIndex. For non-i8 cases,
+    // the element will provide a range of bytes. For example, if we have a
+    // vector of i16s, each element provides two bytes (V[1] provides bytes 2
+    // and 3).
+    if (VectorIndex.value() * NarrowByteWidth > StartingIndex)
+      return None;
+    if ((VectorIndex.value() + 1) * NarrowByteWidth <= StartingIndex)
+      return None;
+
+    return calculateByteProvider(Op->getOperand(0), Index, Depth + 1,
+                                 VectorIndex, StartingIndex);
+  }
   case ISD::LOAD: {
     auto L = cast<LoadSDNode>(Op.getNode());
     if (!L->isSimple() || L->isIndexed())
       return None;
 
     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+    // If the width of the load does not reach the byte we are trying to
+    // provide for and it is not a ZEXTLOAD, then the load does not provide
+    // for the byte in question.
     if (Index >= NarrowByteWidth)
       return L->getExtensionType() == ISD::ZEXTLOAD
                  ? Optional<ByteProvider>(ByteProvider::getConstantZero())
                  : None;
-    return ByteProvider::getMemory(L, Index);
+
+    unsigned BPVectorIndex = VectorIndex.value_or(0U);
+    return ByteProvider::getMemory(L, Index, BPVectorIndex);
   }
   }
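The two range checks in the ISD::EXTRACT_VECTOR_ELT case boil down to "the requested byte must fall inside the extracted element". A self-contained sketch of that predicate (illustrative only, not LLVM code):

#include <cassert>

// Element `VectorIndex` of a vector whose elements are `NarrowByteWidth` bytes
// wide can provide byte `StartingIndex` of the combined value only if that
// byte lies inside the element's byte range. This mirrors the two negated
// checks in the EXTRACT_VECTOR_ELT case above.
bool elementProvidesByte(unsigned VectorIndex, unsigned NarrowByteWidth,
                         unsigned StartingIndex) {
  return VectorIndex * NarrowByteWidth <= StartingIndex &&
         StartingIndex < (VectorIndex + 1) * NarrowByteWidth;
}

int main() {
  // <4 x i8>: element i provides exactly byte i.
  assert(elementProvidesByte(2, 1, 2) && !elementProvidesByte(2, 1, 3));
  // <2 x i16>: element 1 provides bytes 2 and 3, but not byte 1.
  assert(elementProvidesByte(1, 2, 2) && elementProvidesByte(1, 2, 3));
  assert(!elementProvidesByte(1, 2, 1));
  return 0;
}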
@@ -8191,7 +8271,8 @@
   bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
   auto MemoryByteOffset = [&] (ByteProvider P) {
     assert(P.isMemory() && "Must be a memory byte provider");
-    unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
+    unsigned LoadBitWidth = P.Load->getMemoryVT().getScalarSizeInBits();
+
     assert(LoadBitWidth % 8 == 0 &&
            "can only analyze providers for individual bytes not bit");
     unsigned LoadByteWidth = LoadBitWidth / 8;
@@ -8212,7 +8293,8 @@
   SmallVector<int64_t, 8> ByteOffsets(ByteWidth);
   unsigned ZeroExtendedBytes = 0;
   for (int i = ByteWidth - 1; i >= 0; --i) {
-    auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
+    auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*VectorIndex*/ None,
+                                   /*StartingIndex*/ i);
     if (!P)
       return SDValue();
@@ -8226,10 +8308,6 @@
     assert(P->isMemory() && "provenance should either be memory or zero");
 
     LoadSDNode *L = P->Load;
-    assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
-           !L->isIndexed() &&
-           "Must be enforced by calculateByteProvider");
-    assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
 
     // All loads must share the same chain
     SDValue LChain = L->getChain();
@@ -8241,8 +8319,25 @@
     // Loads must share the same base address
     BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
     int64_t ByteOffsetFromBase = 0;
+
+    // For vector loads, the expected load combine pattern will have an
+    // ExtractElement for each index in the vector. While each of these
+    // ExtractElements accesses the same base address as determined by the
+    // load instruction, the actual bytes they interact with differ due to
+    // their different ExtractElement indices. To accurately determine the
+    // byte position of an ExtractElement, we offset the base load pointer by
+    // the element index multiplied by the byte size of each vector element.
+    if (L->getMemoryVT().isVector()) {
+      unsigned LoadWidthInBit = L->getMemoryVT().getScalarSizeInBits();
+      if (LoadWidthInBit % 8 != 0)
+        return SDValue();
+      unsigned ByteOffsetFromVector = P->VectorOffset * LoadWidthInBit / 8;
+      Ptr.addToOffset(ByteOffsetFromVector);
+    }
+
     if (!Base)
       Base = Ptr;
+
     else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
       return SDValue();
@@ -8258,6 +8353,7 @@
     Loads.insert(L);
   }
+
   assert(!Loads.empty() && "All the bytes of the value must be loaded from "
          "memory, so there must be at least one load which produces the value");
   assert(Base && "Base address of the accessed memory location must be set");
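The vector-load handling above biases each provider's pointer by the element index times the element byte width, so the per-byte offsets form the consecutive run the load combine expects. A minimal standalone sketch of that arithmetic (illustration only, not LLVM code; byteOffsetFromBase is a hypothetical helper):

#include <cassert>
#include <cstdint>

// Every byte provider of a combined vector load points at the same load node,
// so the element index recorded in ByteProvider::VectorOffset is turned into
// a byte offset from the load's base pointer.
int64_t byteOffsetFromBase(unsigned VectorOffset, unsigned ScalarSizeInBits) {
  assert(ScalarSizeInBits % 8 == 0 && "element width must be a byte multiple");
  return static_cast<int64_t>(VectorOffset) * (ScalarSizeInBits / 8);
}

int main() {
  // For a <4 x i8> load, the four extracted elements land at offsets 0..3,
  // i.e. exactly the consecutive run the load combine is looking for.
  for (unsigned Elt = 0; Elt < 4; ++Elt)
    assert(byteOffsetFromBase(Elt, 8) == static_cast<int64_t>(Elt));
  // For a <2 x i16> load, element 1 starts two bytes past the base pointer.
  assert(byteOffsetFromBase(1, 16) == 2);
  return 0;
}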
diff --git a/llvm/test/CodeGen/AArch64/load-combine.ll b/llvm/test/CodeGen/AArch64/load-combine.ll
--- a/llvm/test/CodeGen/AArch64/load-combine.ll
+++ b/llvm/test/CodeGen/AArch64/load-combine.ll
@@ -562,18 +562,11 @@
   ret i32 %tmp8
 }
 
+; x1 = x0
 define void @short_vector_to_i32(<4 x i8>* %in, i32* %out, i32* %p) {
 ; CHECK-LABEL: short_vector_to_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    bfi w8, w9, #8, #8
-; CHECK-NEXT:    bfi w8, w10, #16, #8
-; CHECK-NEXT:    bfi w8, w11, #24, #8
+; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
   %ld = load <4 x i8>, <4 x i8>* %in, align 4
@@ -638,13 +631,11 @@
 ; CHECK-LABEL: short_vector_to_i32_unused_high_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ldrh w9, [x0]
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    bfi w8, w9, #8, #8
-; CHECK-NEXT:    bfi w8, w10, #16, #8
-; CHECK-NEXT:    str w8, [x1]
+; CHECK-NEXT:    umov w8, v0.h[2]
+; CHECK-NEXT:    bfi w9, w8, #16, #8
+; CHECK-NEXT:    str w9, [x1]
 ; CHECK-NEXT:    ret
   %ld = load <4 x i8>, <4 x i8>* %in, align 4
@@ -694,14 +685,11 @@
   ret void
 }
 
+; x1 = x0[0:1]
 define void @short_vector_to_i32_unused_high_i16(<4 x i8>* %in, i32* %out, i32* %p) {
 ; CHECK-LABEL: short_vector_to_i32_unused_high_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    bfi w8, w9, #8, #8
+; CHECK-NEXT:    ldrh w8, [x0]
 ; CHECK-NEXT:    str w8, [x1]
 ; CHECK-NEXT:    ret
   %ld = load <4 x i8>, <4 x i8>* %in, align 4
@@ -720,18 +708,11 @@
   ret void
 }
 
+; x1 = x0
 define void @short_vector_to_i64(<4 x i8>* %in, i64* %out, i64* %p) {
 ; CHECK-LABEL: short_vector_to_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr s0, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    bfi x8, x9, #8, #8
-; CHECK-NEXT:    bfi x8, x10, #16, #8
-; CHECK-NEXT:    bfi x8, x11, #24, #8
+; CHECK-NEXT:    ldr w8, [x0]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    ret
   %ld = load <4 x i8>, <4 x i8>* %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -9,15 +9,9 @@
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_load_dword v2, v[0:1]
-; GCN-NEXT:    s_mov_b32 s0, 0x6050400
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_bfe_u32 v3, v2, 8, 8
-; GCN-NEXT:    v_and_b32_e32 v4, 0xff0000, v2
-; GCN-NEXT:    v_perm_b32 v3, v3, v2, s0
-; GCN-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
-; GCN-NEXT:    v_or3_b32 v2, v3, v4, v2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -84,10 +78,7 @@
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off
-; GCN-NEXT:    s_mov_b32 s4, 0xffff
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GCN-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1
   %p.0 = load i16, i16 addrspace(1)* %p, align 4
@@ -162,8 +153,6 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
-; GCN-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, i16 addrspace(1)* %p, i32 1
   %gep.2p = getelementptr i16, i16 addrspace(1)* %p, i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -184,10 +184,7 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_load_2xi16_align1:
@@ -196,8 +193,6 @@
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_load_2xi16_align1:
@@ -206,9 +201,6 @@
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
   %p.0 = load i16, i16 addrspace(1)* %p, align 1
@@ -293,13 +285,6 @@
 ; Should merge this to a dword load
 define i32 @global_load_2xi16_align4(i16 addrspace(1)* %p) #0 {
-; GFX7-LABEL: load_2xi16_align4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-ALIGNED-LABEL: global_load_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -318,10 +303,7 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: global_load_2xi16_align4:
@@ -330,8 +312,6 @@
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: global_load_2xi16_align4:
@@ -340,9 +320,6 @@
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
   %p.0 = load i16, i16 addrspace(1)* %p, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -387,13 +387,6 @@
 ; Should merge this to a dword load
 define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
-; GFX7-LABEL: load_2xi16_align4:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    flat_load_dword v0, v[0:1]
-; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-ALIGNED-LABEL: private_load_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -412,20 +405,14 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-FLASTSCR-LABEL: private_load_2xi16_align4:
 ; GFX9-FLASTSCR:       ; %bb.0:
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
-; GFX9-FLASTSCR-NEXT:    s_mov_b32 s0, 0xffff
 ; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-FLASTSCR-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX9-FLASTSCR-NEXT:    v_and_or_b32 v0, v0, s0, v1
 ; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_load_2xi16_align4:
@@ -434,8 +421,6 @@
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-FLASTSCR-LABEL: private_load_2xi16_align4:
@@ -444,8 +429,6 @@
 ; GFX10-FLASTSCR:       ; %bb.0:
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
 ; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-FLASTSCR-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX10-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_load_2xi16_align4:
@@ -454,9 +437,6 @@
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FLASTSCR-LABEL: private_load_2xi16_align4:
@@ -465,9 +445,6 @@
 ; GFX11-FLASTSCR:       ; %bb.0:
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLASTSCR-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
-; GFX11-FLASTSCR-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
 ; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
   %p.0 = load i16, i16 addrspace(5)* %p, align 4