diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -58,6 +58,7 @@
     /// of the extension
     AssertSext, AssertZext,
+    AssertAlign,
 
     /// Various leaf nodes.
     BasicBlock,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1342,6 +1342,9 @@
   /// Return a freeze using the SDLoc of the value operand.
   SDValue getFreeze(SDValue V);
 
+  /// Return an AssertAlignSDNode.
+  SDValue getAssertAlign(const SDLoc &DL, SDValue V, Align A);
+
   /// Return the specified value casted to
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -2526,6 +2526,22 @@
   }
 };
 
+/// An SDNode that records if a register contains a value that is guaranteed to
+/// be aligned accordingly.
+class AssertAlignSDNode : public SDNode {
+  Align Alignment;
+
+public:
+  AssertAlignSDNode(unsigned Order, const DebugLoc &DL, EVT VT, Align A)
+      : SDNode(ISD::AssertAlign, Order, DL, getSDVTList(VT)), Alignment(A) {}
+
+  Align getAlign() const { return Alignment; }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::AssertAlign;
+  }
+};
+
 class SDNodeIterator : public std::iterator<std::forward_iterator_tag,
                                             SDNode, ptrdiff_t> {
   const SDNode *Node;
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -667,10 +667,11 @@
 def intrinsic_wo_chain : SDNode<"ISD::INTRINSIC_WO_CHAIN",
                                 SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>, []>;
 
-def SDT_assertext : SDTypeProfile<1, 1,
+def SDT_assert : SDTypeProfile<1, 1,
   [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>;
-def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>;
-def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>;
+def assertsext : SDNode<"ISD::AssertSext", SDT_assert>;
+def assertzext : SDNode<"ISD::AssertZext", SDT_assert>;
+def assertalign : SDNode<"ISD::AssertAlign", SDT_assert>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -455,6 +455,7 @@
     SDValue visitZERO_EXTEND(SDNode *N);
     SDValue visitANY_EXTEND(SDNode *N);
     SDValue visitAssertExt(SDNode *N);
+    SDValue visitAssertAlign(SDNode *N);
     SDValue visitSIGN_EXTEND_INREG(SDNode *N);
     SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
     SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
@@ -1601,6 +1602,7 @@
   case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
   case ISD::AssertSext:
   case ISD::AssertZext:         return visitAssertExt(N);
+  case ISD::AssertAlign:        return visitAssertAlign(N);
   case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
   case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
   case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
@@ -10696,6 +10698,45 @@
   return SDValue();
 }
 
+SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
+  SDLoc DL(N);
+
+  Align AL = cast<AssertAlignSDNode>(N)->getAlign();
+  SDValue N0 = N->getOperand(0);
+
+  // Fold (assertalign (assertalign x, AL0), AL1) ->
+  // (assertalign x, max(AL0, AL1))
+  if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
+    return DAG.getAssertAlign(DL, N0.getOperand(0),
+                              std::max(AL, AAN->getAlign()));
+
+  // In rare cases, there are trivial arithmetic ops in source operands. Sink
+  // this assert down to source operands so that those arithmetic ops could be
+  // exposed to the DAG combining.
+  switch (N0.getOpcode()) {
+  default:
+    break;
+  case ISD::ADD:
+  case ISD::SUB: {
+    unsigned AlignShift = Log2(AL);
+    SDValue LHS = N0.getOperand(0);
+    SDValue RHS = N0.getOperand(1);
+    unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
+    unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
+    if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
+      if (LHSAlignShift < AlignShift)
+        LHS = DAG.getAssertAlign(DL, LHS, AL);
+      if (RHSAlignShift < AlignShift)
+        RHS = DAG.getAssertAlign(DL, RHS, AL);
+      return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
+    }
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
 /// If the result of a wider load is shifted to right of N bits and then
 /// truncated to a narrower type and where N is a multiple of number of bits of
 /// the narrower type, transform it to a narrower load from address + N / num of
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3166,6 +3166,15 @@
     Known.One &= (~Known.Zero);
     break;
   }
+  case ISD::AssertAlign: {
+    unsigned LogOfAlign = Log2(cast<AssertAlignSDNode>(Op)->getAlign());
+    assert(LogOfAlign != 0);
+    // If a node is guaranteed to be aligned, set low zero bits accordingly as
+    // well as clearing one bits.
+    Known.Zero.setLowBits(LogOfAlign);
+    Known.One.clearLowBits(LogOfAlign);
+    break;
+  }
   case ISD::FGETSIGN:
     // All bits are zero except the low bit.
     Known.Zero.setBitsFrom(1);
@@ -5186,6 +5195,34 @@
   return SDValue();
 }
 
+SDValue SelectionDAG::getAssertAlign(const SDLoc &DL, SDValue Val, Align A) {
+  assert(Val.getValueType().isInteger() && "Invalid AssertAlign!");
+
+  // There's no need to assert on a byte-aligned pointer. All pointers are at
+  // least byte aligned.
+  if (A == Align(1))
+    return Val;
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::AssertAlign, getVTList(Val.getValueType()), {Val});
+  ID.AddInteger(A.value());
+
+  void *IP = nullptr;
+  if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+    return SDValue(E, 0);
+
+  auto *N = newSDNode<AssertAlignSDNode>(DL.getIROrder(), DL.getDebugLoc(),
+                                         Val.getValueType(), A);
+  createOperands(N, {Val});
+
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
+}
+
 SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
                               SDValue N1, SDValue N2, const SDNodeFlags Flags) {
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -135,6 +135,11 @@
 /// some float libcalls (6, 8 or 12 bits).
 static unsigned LimitFloatPrecision;
 
+static cl::opt<bool>
+    InsertAssertAlign("insert-assert-align", cl::init(true),
+                      cl::desc("Insert the experimental `assertalign` node."),
+                      cl::ReallyHidden);
+
 static cl::opt<unsigned, true>
     LimitFPPrecision("limit-float-precision",
                      cl::desc("Generate low-precision inline sequences "
@@ -4747,6 +4752,15 @@
     } else
       Result = lowerRangeToAssertZExt(DAG, I, Result);
 
+    MaybeAlign Alignment = I.getRetAlign();
+    if (!Alignment)
+      Alignment = F->getAttributes().getRetAlignment();
+    // Insert `assertalign` node if there's an alignment.
+    if (InsertAssertAlign && Alignment) {
+      Result =
+          DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
+    }
+
     setValue(&I, Result);
   }
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -106,6 +106,7 @@
   case ISD::TokenFactor:                return "TokenFactor";
   case ISD::AssertSext:                 return "AssertSext";
   case ISD::AssertZext:                 return "AssertZext";
+  case ISD::AssertAlign:                return "AssertAlign";
 
   case ISD::BasicBlock:                 return "BasicBlock";
   case ISD::VALUETYPE:                  return "ValueType";
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2820,6 +2820,7 @@
     return;
   case ISD::AssertSext:
   case ISD::AssertZext:
+  case ISD::AssertAlign:
     ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0));
     CurDAG->RemoveDeadNode(NodeToMatch);
     return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1628,6 +1628,37 @@
   llvm_unreachable("cannot find MemSDNode in the pattern!");
 }
 
+static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
+                                          SDValue &N0, SDValue &N1) {
+  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
+      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+    // As we split 64-bit `or` earlier, it's a complicated pattern to match,
+    // i.e.
+    // (i64 (bitcast (v2i32 (build_vector
+    //                        (or (extract_vector_elt V, 0), OFFSET),
+    //                        (extract_vector_elt V, 1)))))
+    SDValue Lo = Addr.getOperand(0).getOperand(0);
+    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
+      SDValue BaseLo = Lo.getOperand(0);
+      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
+      // Check that split base (Lo and Hi) are extracted from the same one.
+      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
+          // Lo is statically extracted from index 0.
+          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
+          BaseLo.getConstantOperandVal(1) == 0 &&
+          // Hi is statically extracted from index 1.
+          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
+          BaseHi.getConstantOperandVal(1) == 1) {
+        N0 = BaseLo.getOperand(0).getOperand(0);
+        N1 = Lo.getOperand(1);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 template <bool IsSigned>
 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
                                           SDValue Addr,
@@ -1638,84 +1669,91 @@
   if (Subtarget->hasFlatInstOffsets() &&
       (!Subtarget->hasFlatSegmentOffsetBug() ||
-       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
-      CurDAG->isBaseWithConstantOffset(Addr)) {
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-    uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
-    const SIInstrInfo *TII = Subtarget->getInstrInfo();
-    unsigned AS = findMemSDNode(N)->getAddressSpace();
-    if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
-      Addr = N0;
-      OffsetVal = COffsetVal;
-    } else {
-      // If the offset doesn't fit, put the low bits into the offset field and
-      // add the rest.
-
-      SDLoc DL(N);
-      uint64_t ImmField;
-      const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
-      if (IsSigned) {
-        ImmField = SignExtend64(COffsetVal, NumBits);
-
-        // Don't use a negative offset field if the base offset is positive.
-        // Since the scheduler currently relies on the offset field, doing so
-        // could result in strange scheduling decisions.
-
-        // TODO: Should we not do this in the opposite direction as well?
-        if (static_cast<int64_t>(COffsetVal) > 0) {
-          if (static_cast<int64_t>(ImmField) < 0) {
-            const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
-            ImmField = COffsetVal & OffsetMask;
+       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+    SDValue N0, N1;
+    if (CurDAG->isBaseWithConstantOffset(Addr)) {
+      N0 = Addr.getOperand(0);
+      N1 = Addr.getOperand(1);
+    } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+      assert(N0 && N1 && isa<ConstantSDNode>(N1));
+    }
+    if (N0 && N1) {
+      uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+      const SIInstrInfo *TII = Subtarget->getInstrInfo();
+      unsigned AS = findMemSDNode(N)->getAddressSpace();
+      if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+        Addr = N0;
+        OffsetVal = COffsetVal;
+      } else {
+        // If the offset doesn't fit, put the low bits into the offset field and
+        // add the rest.
+
+        SDLoc DL(N);
+        uint64_t ImmField;
+        const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
+        if (IsSigned) {
+          ImmField = SignExtend64(COffsetVal, NumBits);
+
+          // Don't use a negative offset field if the base offset is positive.
+          // Since the scheduler currently relies on the offset field, doing so
+          // could result in strange scheduling decisions.
+
+          // TODO: Should we not do this in the opposite direction as well?
+          if (static_cast<int64_t>(COffsetVal) > 0) {
+            if (static_cast<int64_t>(ImmField) < 0) {
+              const uint64_t OffsetMask =
+                  maskTrailingOnes<uint64_t>(NumBits - 1);
+              ImmField = COffsetVal & OffsetMask;
+            }
           }
+        } else {
+          // TODO: Should we do this for a negative offset?
+          const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
+          ImmField = COffsetVal & OffsetMask;
         }
-      } else {
-        // TODO: Should we do this for a negative offset?
-        const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
-        ImmField = COffsetVal & OffsetMask;
-      }
 
-      uint64_t RemainderOffset = COffsetVal - ImmField;
+        uint64_t RemainderOffset = COffsetVal - ImmField;
 
-      assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
-      assert(RemainderOffset + ImmField == COffsetVal);
+        assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
+        assert(RemainderOffset + ImmField == COffsetVal);
 
-      OffsetVal = ImmField;
+        OffsetVal = ImmField;
 
-      // TODO: Should this try to use a scalar add pseudo if the base address is
-      // uniform and saddr is usable?
-      SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
-      SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+        // TODO: Should this try to use a scalar add pseudo if the base address
+        // is uniform and saddr is usable?
+        SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+        SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
 
-      SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                            DL, MVT::i32, N0, Sub0);
-      SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
-                                            DL, MVT::i32, N0, Sub1);
+        SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+                                              MVT::i32, N0, Sub0);
+        SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+                                              MVT::i32, N0, Sub1);
 
-      SDValue AddOffsetLo
-        = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
-      SDValue AddOffsetHi
-        = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+        SDValue AddOffsetLo =
+            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+        SDValue AddOffsetHi =
+            getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
 
-      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
-      SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
 
-      SDNode *Add = CurDAG->getMachineNode(
-        AMDGPU::V_ADD_I32_e64, DL, VTs,
-        {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+        SDNode *Add =
+            CurDAG->getMachineNode(AMDGPU::V_ADD_I32_e64, DL, VTs,
+                                   {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
 
-      SDNode *Addc = CurDAG->getMachineNode(
-        AMDGPU::V_ADDC_U32_e64, DL, VTs,
-        {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+        SDNode *Addc = CurDAG->getMachineNode(
+            AMDGPU::V_ADDC_U32_e64, DL, VTs,
+            {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
 
-      SDValue RegSequenceArgs[] = {
-        CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
-        SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
-      };
+        SDValue RegSequenceArgs[] = {
+            CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+            SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
 
-      Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                            MVT::i64, RegSequenceArgs), 0);
+        Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                              MVT::i64, RegSequenceArgs),
+                       0);
+      }
     }
   }
@@ -1824,15 +1862,21 @@
   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
   // wraparound, because s_load instructions perform the addition in 64 bits.
   if ((Addr.getValueType() != MVT::i32 ||
-       Addr->getFlags().hasNoUnsignedWrap()) &&
-      (CurDAG->isBaseWithConstantOffset(Addr) ||
-       Addr.getOpcode() == ISD::ADD)) {
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-
-    if (SelectSMRDOffset(N1, Offset, Imm)) {
-      SBase = Expand32BitAddress(N0);
-      return true;
+       Addr->getFlags().hasNoUnsignedWrap())) {
+    SDValue N0, N1;
+    // Extract the base and offset if possible.
+    if (CurDAG->isBaseWithConstantOffset(Addr) ||
+        Addr.getOpcode() == ISD::ADD) {
+      N0 = Addr.getOperand(0);
+      N1 = Addr.getOperand(1);
+    } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+      assert(N0 && N1 && isa<ConstantSDNode>(N1));
+    }
+    if (N0 && N1) {
+      if (SelectSMRDOffset(N1, Offset, Imm)) {
+        SBase = Expand32BitAddress(N0);
+        return true;
+      }
     }
   }
   SBase = Expand32BitAddress(Addr);
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -30,10 +30,9 @@
 define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII:       ; %bb.0:
-; HAWAII-NEXT:    s_add_u32 s0, s4, 14
-; HAWAII-NEXT:    s_addc_u32 s1, s5, 0
+; HAWAII-NEXT:    s_or_b32 s0, s4, 14
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
-; HAWAII-NEXT:    v_mov_b32_e32 v1, s1
+; HAWAII-NEXT:    v_mov_b32_e32 v1, s5
 ; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
 ; HAWAII-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; HAWAII-NEXT:    s_load_dword s1, s[4:5], 0x2
@@ -52,27 +51,26 @@
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI:       ; %bb.0:
+; FIJI-NEXT:    s_or_b32 s0, s4, 14
+; FIJI-NEXT:    v_mov_b32_e32 v0, s0
+; FIJI-NEXT:    v_mov_b32_e32 v1, s5
+; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; FIJI-NEXT:    s_load_dword s0, s[4:5], 0x0
-; FIJI-NEXT:    s_load_dword s2, s[4:5], 0x8
-; FIJI-NEXT:    s_load_dword s1, s[4:5], 0xc
+; FIJI-NEXT:    s_load_dword s1, s[4:5], 0x8
+; FIJI-NEXT:    s_load_dword s2, s[4:5], 0xc
 ; FIJI-NEXT:    s_mov_b32 m0, -1
 ; FIJI-NEXT:    s_waitcnt lgkmcnt(0)
-; FIJI-NEXT:    v_mov_b32_e32 v2, s0
-; FIJI-NEXT:    s_and_b32 s3, s1, 0xffff
-; FIJI-NEXT:    s_add_u32 s0, s4, 14
+; FIJI-NEXT:    v_mov_b32_e32 v1, s0
 ; FIJI-NEXT:    v_mov_b32_e32 v3, s1
-; FIJI-NEXT:    s_addc_u32 s1, s5, 0
-; FIJI-NEXT:    v_mov_b32_e32 v0, s0
-; FIJI-NEXT:    v_mov_b32_e32 v1, s1
-; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
-; FIJI-NEXT:    ds_write_b16 v2, v3 offset:4
-; FIJI-NEXT:    v_mov_b32_e32 v3, s2
-; FIJI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; FIJI-NEXT:    s_and_b32 s3, s2, 0xffff
+; FIJI-NEXT:    v_mov_b32_e32 v2, s2
+; FIJI-NEXT:    ds_write_b16 v1, v2 offset:4
+; FIJI-NEXT:    s_waitcnt vmcnt(0)
 ; FIJI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; FIJI-NEXT:    v_or_b32_e32 v0, s3, v0
 ; FIJI-NEXT:    v_bfe_u32 v0, v0, 16, 7
-; FIJI-NEXT:    ds_write_b8 v2, v0 offset:6
-; FIJI-NEXT:    ds_write_b32 v2, v3
+; FIJI-NEXT:    ds_write_b8 v1, v0 offset:6
+; FIJI-NEXT:    ds_write_b32 v1, v3
 ; FIJI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: local_store_i55:
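
Note (illustration only, not part of the patch): the sketch below shows one way target or combiner code could consume the new node. It uses only interfaces introduced in this diff (ISD::AssertAlign, AssertAlignSDNode::getAlign(), SelectionDAG::getAssertAlign()) together with existing SDValue/dyn_cast machinery; the helper name getAssertedAlign is hypothetical.

  // Hypothetical helper (sketch only; assumes the SelectionDAGNodes.h changes
  // above are applied): recover the strongest alignment asserted on a value by
  // looking through stacked AssertAlign nodes.
  static Align getAssertedAlign(SDValue V) {
    Align Best(1);
    while (auto *AA = dyn_cast<AssertAlignSDNode>(V.getNode())) {
      Best = std::max(Best, AA->getAlign());
      V = AA->getOperand(0);
    }
    return Best;
  }

  // Creating the node, e.g. for an integer value known to be 16-byte aligned:
  //   SDValue Asserted = DAG.getAssertAlign(DL, Val, Align(16));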