diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -200,15 +200,16 @@
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
-                       unsigned OffsetBits) const;
+  bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
+  bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
+                        unsigned Size) const;
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                  SDValue &Offset1) const;
   bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                   SDValue &Offset1) const;
   bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
-                          SDValue &Offset1, bool IsDS128) const;
+                          SDValue &Offset1, unsigned Size) const;
   bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
@@ -1155,13 +1156,11 @@
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
-                                         unsigned OffsetBits) const {
-  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
-      (OffsetBits == 8 && !isUInt<8>(Offset)))
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
+  if (!isUInt<16>(Offset))
     return false;
 
-  if (Subtarget->hasUsableDSOffset() ||
+  if (!Base || Subtarget->hasUsableDSOffset() ||
       Subtarget->unsafeDSOffsetFoldingEnabled())
     return true;
 
@@ -1177,7 +1176,7 @@
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-    if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) {
+    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
       // (add n0, c0)
       Base = N0;
       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
@@ -1187,7 +1186,7 @@
     // sub C, x -> add (sub 0, x), C
     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
       int64_t ByteOffset = C->getSExtValue();
-      if (isUInt<16>(ByteOffset)) {
+      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
 
         // XXX - This is kind of hacky. Create a dummy sub node so we can check
@@ -1196,7 +1195,7 @@
         SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                       Zero, Addr.getOperand(1));
 
-        if (isDSOffsetLegal(Sub, ByteOffset, 16)) {
+        if (isDSOffsetLegal(Sub, ByteOffset)) {
           SmallVector<SDValue, 3> Opnds;
           Opnds.push_back(Zero);
           Opnds.push_back(Addr.getOperand(1));
@@ -1226,7 +1225,7 @@
 
     SDLoc DL(Addr);
 
-    if (isUInt<16>(CAddr->getZExtValue())) {
+    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
       MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                       DL, MVT::i32, Zero);
@@ -1242,46 +1241,63 @@
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
+                                          unsigned Offset1,
+                                          unsigned Size) const {
+  if (Offset0 % Size != 0 || Offset1 % Size != 0)
+    return false;
+  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
+    return false;
+
+  if (!Base || Subtarget->hasUsableDSOffset() ||
+      Subtarget->unsafeDSOffsetFoldingEnabled())
+    return true;
+
+  // On Southern Islands instruction with a negative base value and an offset
+  // don't seem to work.
+  return CurDAG->SignBitIsZero(Base);
+}
+
 // TODO: If offset is too big, put low 16-bit into offset.
 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
-  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, false);
+  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
 }
 
 bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                     SDValue &Offset0,
                                                     SDValue &Offset1) const {
-  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, true);
+  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
 }
 
 bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                             SDValue &Offset0, SDValue &Offset1,
-                                            bool IsDS128) const {
+                                            unsigned Size) const {
   SDLoc DL(Addr);
-  unsigned Align = IsDS128 ? 8 : 4;
 
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
-    unsigned OffsetValue0 = C1->getZExtValue() / Align;
-    unsigned OffsetValue1 = OffsetValue0 + 1;
+    unsigned OffsetValue0 = C1->getZExtValue();
+    unsigned OffsetValue1 = OffsetValue0 + Size;
+
     // (add n0, c0)
-    if (isDSOffsetLegal(N0, OffsetValue1, 8)) {
+    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
       Base = N0;
-      Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
-      Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
+      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
       return true;
     }
   } else if (Addr.getOpcode() == ISD::SUB) {
     // sub C, x -> add (sub 0, x), C
     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
-      unsigned OffsetValue0 = C->getZExtValue() / Align;
-      unsigned OffsetValue1 = OffsetValue0 + 1;
+      unsigned OffsetValue0 = C->getZExtValue();
+      unsigned OffsetValue1 = OffsetValue0 + Size;
 
-      if (isUInt<8>(OffsetValue0)) {
+      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
 
@@ -1291,7 +1307,7 @@
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));
 
-       if (isDSOffsetLegal(Sub, OffsetValue1, 8)) {
+       if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
@@ -1303,27 +1319,26 @@
        }
 
        MachineSDNode *MachineSub = CurDAG->getMachineNode(
-           SubOp, DL, (IsDS128 ? MVT::i64 : MVT::i32), Opnds);
+           SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
 
        Base = SDValue(MachineSub, 0);
-       Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
-       Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
+       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
        return true;
       }
     }
   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
-    unsigned OffsetValue0 = CAddr->getZExtValue() / Align;
-    unsigned OffsetValue1 = OffsetValue0 + 1;
-    bool OffsetIsAligned = Align * OffsetValue0 == CAddr->getZExtValue();
+    unsigned OffsetValue0 = CAddr->getZExtValue();
+    unsigned OffsetValue1 = OffsetValue0 + Size;
 
-    if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1) && OffsetIsAligned) {
+    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                      DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
-     Offset0 = CurDAG->getTargetConstant(OffsetValue0, DL, MVT::i8);
-     Offset1 = CurDAG->getTargetConstant(OffsetValue1, DL, MVT::i8);
+     Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
+     Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
     }
   }
@@ -2412,7 +2427,7 @@
     SDValue PtrOffset = Ptr.getOperand(1);
     const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
 
-    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -203,8 +203,9 @@
   InstructionSelector::ComplexRendererFns
   selectMUBUFScratchOffset(MachineOperand &Root) const;
 
-  bool isDSOffsetLegal(Register Base, int64_t Offset,
-                       unsigned OffsetBits) const;
+  bool isDSOffsetLegal(Register Base, int64_t Offset) const;
+  bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
+                        unsigned Size) const;
 
   std::pair<Register, unsigned>
   selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
@@ -217,10 +218,10 @@
   InstructionSelector::ComplexRendererFns
   selectDS128Bit8ByteAligned(MachineOperand &Root) const;
 
-  std::pair<Register, unsigned>
-  selectDSReadWrite2Impl(MachineOperand &Root, bool IsDS128) const;
+  std::pair<Register, unsigned> selectDSReadWrite2Impl(MachineOperand &Root,
+                                                       unsigned size) const;
   InstructionSelector::ComplexRendererFns
-  selectDSReadWrite2(MachineOperand &Root, bool IsDS128) const;
+  selectDSReadWrite2(MachineOperand &Root, unsigned size) const;
 
   std::pair<Register, int64_t>
   getPtrBaseWithConstantOffset(Register Root,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1408,7 +1408,7 @@
   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
 
   // TODO: Should this try to look through readfirstlane like GWS?
-  if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
+  if (!isDSOffsetLegal(PtrBase, Offset)) {
     PtrBase = MI.getOperand(2).getReg();
     Offset = 0;
   }
@@ -3636,10 +3636,24 @@
 }
 
 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
-                                                int64_t Offset,
-                                                unsigned OffsetBits) const {
-  if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
-      (OffsetBits == 8 && !isUInt<8>(Offset)))
+                                                int64_t Offset) const {
+  if (!isUInt<16>(Offset))
+    return false;
+
+  if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
+    return true;
+
+  // On Southern Islands instruction with a negative base value and an offset
+  // don't seem to work.
+  return KnownBits->signBitIsZero(Base);
+}
+
+bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
+                                                 int64_t Offset1,
+                                                 unsigned Size) const {
+  if (Offset0 % Size != 0 || Offset1 % Size != 0)
+    return false;
+  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
     return false;
 
   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
@@ -3694,7 +3708,7 @@
       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
 
   if (Offset) {
-    if (isDSOffsetLegal(PtrBase, Offset, 16)) {
+    if (isDSOffsetLegal(PtrBase, Offset)) {
      // (add n0, c0)
      return std::make_pair(PtrBase, Offset);
    }
@@ -3723,20 +3737,20 @@
 
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
-  return selectDSReadWrite2(Root, false);
+  return selectDSReadWrite2(Root, 4);
 }
 
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
-  return selectDSReadWrite2(Root, true);
+  return selectDSReadWrite2(Root, 8);
 }
 
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
-                                              bool IsDS128) const {
+                                              unsigned Size) const {
   Register Reg;
   unsigned Offset;
-  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, IsDS128);
+  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
   return {{
       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
@@ -3746,7 +3760,7 @@
 
 std::pair<Register, unsigned>
 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
-                                                  bool IsDS128) const {
+                                                  unsigned Size) const {
   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
   if (!RootDef)
     return std::make_pair(Root.getReg(), 0);
@@ -3759,11 +3773,11 @@
       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
 
   if (Offset) {
-    int64_t OffsetValue0 = Offset / (IsDS128 ? 8 : 4);
-    int64_t OffsetValue1 = OffsetValue0 + 1;
-    if (isDSOffsetLegal(PtrBase, OffsetValue1, (IsDS128 ? 16 : 8))) {
+    int64_t OffsetValue0 = Offset;
+    int64_t OffsetValue1 = Offset + Size;
+    if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
      // (add n0, c0)
-      return std::make_pair(PtrBase, OffsetValue0);
+      return std::make_pair(PtrBase, OffsetValue0 / Size);
    }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local-128.mir
@@ -102,8 +102,10 @@
     ; GFX7-LABEL: name: load_local_v4s32_align_8_offset_320
     ; GFX7: liveins: $vgpr0
     ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4000, implicit $exec
+    ; GFX7: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX7: $m0 = S_MOV_B32 -1
-    ; GFX7: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 [[COPY]], 500, 501, 0, implicit $m0, implicit $exec :: (load 16, align 8, addrspace 3)
+    ; GFX7: [[DS_READ2_B64_:%[0-9]+]]:vreg_128 = DS_READ2_B64 %2, 0, 1, 0, implicit $m0, implicit $exec :: (load 16, align 8, addrspace 3)
     ; GFX7: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[DS_READ2_B64_]]
     ; GFX9-LABEL: name: load_local_v4s32_align_8_offset_320
     ; GFX9: liveins: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -194,20 +194,20 @@
 ; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset0:254 offset1:255
+; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; CI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset0:254 offset1:255
+; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; GFX9-NEXT:    s_endpgm
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
@@ -223,7 +223,7 @@
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[0:1], 0x9
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
 ; CI-NEXT:    s_mov_b64 vcc, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -235,7 +235,7 @@
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_mov_b32 s1, s0
-; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset0:254 offset1:255
+; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
 ; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -244,13 +244,13 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT:    s_mov_b64 vcc, 0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
-; GFX9-NEXT:    ds_write2_b32 v0, v1, v3 offset0:254 offset1:255
+; GFX9-NEXT:    ds_write2_b32 v0, v1, v3 offset1:1
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -693,8 +693,8 @@
 ; GFX9-UNALIGNED-NEXT:    s_load_dword s0, s[0:1], 0x2c
 ; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v0, s0, v2
-; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset0:1 offset1:2
+; GFX9-UNALIGNED-NEXT:    v_add3_u32 v0, s0, v2, 5
+; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -701,9 +701,11 @@
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-UNALIGNED-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, s0, v2
+; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v3, 5, v2
+; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v2, 9, v2
 ; GFX9-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset0:1 offset1:2
-; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset0:2 offset1:3
+; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
+; GFX9-UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
 ; GFX9-UNALIGNED-NEXT:    s_endpgm
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
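
Note on the rule the patch enforces (a minimal standalone sketch, not part of the patch): ds_read2/ds_write2 encode two 8-bit element-granular offsets, so a pair of byte offsets is only foldable when both are multiples of the element size (4 bytes for the b32 forms via SelectDS64Bit4ByteAligned, 8 bytes for b64 via SelectDS128Bit8ByteAligned) and both scaled values fit in 8 bits. The helper name and simplified signature below are illustrative only; the in-tree isDSOffset2Legal helpers additionally consult hasUsableDSOffset / unsafeDSOffsetFoldingEnabled and the sign bit of the base pointer, as shown in the hunks above.

// Sketch only: restates the alignment + range check introduced by the patch,
// without the subtarget and base-pointer checks.
#include <cstdint>

static bool isDSOffset2LegalSketch(uint64_t Offset0, uint64_t Offset1,
                                   unsigned Size) {
  // Both byte offsets must be multiples of the element size...
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
    return false;
  // ...and both scaled offsets must fit the 8-bit offset0/offset1 fields.
  return Offset0 / Size <= 255 && Offset1 / Size <= 255;
}

// Example: byte offsets 1016 and 1020 with Size = 4 scale to 254 and 255 and
// are legal (ds_write2_b32 ... offset0:254 offset1:255). Byte offset 1019
// (0x3fb) is not a multiple of 4, so after this patch it is folded into the
// base address instead, which is what the updated ds-sub-offset.ll checks
// (v_sub ... 0x3fb followed by ds_write2_b32 ... offset1:1) exercise.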