Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -20,6 +20,24 @@
 // ==>
 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
 //
+// This pass also tries to promote a constant offset to the immediate by
+// re-computing the base and the relative offset from nearby instructions.
+// E.g.
+//   s_movk_i32 s0, 0x1800
+//   v_add_co_u32_e32 v0, vcc, s0, v2
+//   v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+//
+//   s_movk_i32 s0, 0x1000
+//   v_add_co_u32_e32 v5, vcc, s0, v2
+//   v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//   global_load_dwordx2 v[5:6], v[5:6], off
+//   global_load_dwordx2 v[0:1], v[0:1], off
+// =>
+//   s_movk_i32 s0, 0x1000
+//   v_add_co_u32_e32 v5, vcc, s0, v2
+//   v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+//   global_load_dwordx2 v[5:6], v[5:6], off
+//   global_load_dwordx2 v[0:1], v[5:6], off offset:2048
 //
 // Future improvements:
 //
@@ -102,6 +120,21 @@
     SmallVector<MachineInstr *, 8> InstsToMove;
   };
 
+  struct BaseRegisters {
+    unsigned LoReg = 0;
+    unsigned HiReg = 0;
+
+    unsigned LoSubReg = 0;
+    unsigned HiSubReg = 0;
+  };
+
+  struct MemAddress {
+    BaseRegisters Base;
+    int64_t Offset = 0;
+  };
+
+  using MemInfoMap = DenseMap<MachineInstr *, MemAddress *>;
+
 private:
   const GCNSubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
@@ -127,6 +160,14 @@
                                              bool &IsOffen) const;
   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
 
+  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
+                           int32_t NewOffset);
+  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
+  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
+  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
+  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
+  bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited,
+                                  SmallPtrSet<MachineInstr *, 4> &Promoted);
 
 public:
   static char ID;
@@ -826,15 +867,277 @@
   return Next;
 }
 
+MachineOperand
+SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
+  APInt V(32, Val, true);
+  if (TII->isInlineConstant(V))
+    return MachineOperand::CreateImm(Val);
+
+  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  MachineInstr *Mov =
+      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+              TII->get(AMDGPU::S_MOV_B32), Reg)
+          .addImm(Val);
+  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
+  return MachineOperand::CreateReg(Reg, false);
+}
+
+// Compute base address using Addr and return the final register.
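+// A carry-propagating 64-bit add of Addr.Offset to the {LoReg, HiReg} pair is
+// emitted as a V_ADD_I32_e64 / V_ADDC_U32_e64 pair, and the two halves are
+// then recombined into a fresh VReg_64 with a REG_SEQUENCE.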
+unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
+                                           const MemAddress &Addr) {
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  DebugLoc DL = MI.getDebugLoc();
+
+  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
+          Addr.Base.LoSubReg) &&
+         "Expected 32-bit Base-Register-Low!!");
+
+  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
+          Addr.Base.HiSubReg) &&
+         "Expected 32-bit Base-Register-Hi!!");
+
+  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
+  MachineOperand OffsetLo =
+      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
+  MachineOperand OffsetHi =
+      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
+  unsigned CarryReg =
+      MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  unsigned DeadCarryReg =
+      MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+
+  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineInstr *LoHalf =
+      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
+          .addReg(CarryReg, RegState::Define)
+          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
+          .add(OffsetLo);
+  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
+
+  MachineInstr *HiHalf =
+      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
+          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
+          .add(OffsetHi)
+          .addReg(CarryReg, RegState::Kill);
+  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
+
+  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+  MachineInstr *FullBase =
+      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
+          .addReg(DestSub0)
+          .addImm(AMDGPU::sub0)
+          .addReg(DestSub1)
+          .addImm(AMDGPU::sub1);
+  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
+
+  return FullDestReg;
+}
+
+void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
+                                               unsigned NewBase,
+                                               int32_t NewOffset) {
+  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
+  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
+}
+
+Optional<int32_t>
+SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
+  if (Op.isImm())
+    return Op.getImm();
+
+  if (!Op.isReg())
+    return None;
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
+  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
+      !Def->getOperand(1).isImm())
+    return None;
+
+  return Def->getOperand(1).getImm();
+}
+
+void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
+                                                      MemAddress &Addr) {
+  if (!Base.isReg())
+    return;
+
+  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
+  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
+      Def->getNumOperands() != 5)
+    return;
+
+  MachineOperand BaseLo = Def->getOperand(1);
+  MachineOperand BaseHi = Def->getOperand(3);
+  if (!BaseLo.isReg() || !BaseHi.isReg())
+    return;
+
+  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
+  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
+
+  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
+      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
+    return;
+
+  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
+  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
+
+  auto Offset0P = extractConstOffset(*Src0);
+  if (Offset0P)
+    BaseLo = *Src1;
+  else {
+    if (!(Offset0P = extractConstOffset(*Src1)))
+      return;
+    BaseLo = *Src0;
+  }
+
+  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
+  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
+
+  if (Src0->isImm())
+    std::swap(Src0, Src1);
+
+  if (!Src1->isImm())
+    return;
+
+  int64_t Offset1 = Src1->getImm();
+  BaseHi = *Src0;
+
+  Addr.Base.LoReg = BaseLo.getReg();
+  Addr.Base.HiReg = BaseHi.getReg();
+  Addr.Base.LoSubReg = BaseLo.getSubReg();
+  Addr.Base.HiSubReg = BaseHi.getSubReg();
+  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
+}
+
+bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
+    MachineInstr &MI,
+    MemInfoMap &Visited,
+    SmallPtrSet<MachineInstr *, 4> &AnchorList) {
+
+  // TODO: Support flat and scratch.
+  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
+      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
+    return false;
+
+  // TODO: Support stores.
+  if (!MI.mayLoad())
+    return false;
+
+  if (AnchorList.count(&MI))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
+
+  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
+    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
+    return false;
+  }
+
+  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  MemAddress *MAddr;
+  if (Visited.find(&MI) == Visited.end()) {
+    MAddr = new MemAddress();
+    processBaseWithConstOffset(Base, *MAddr);
+    Visited[&MI] = MAddr;
+  } else
+    MAddr = Visited[&MI];
+
+  if (MAddr->Offset == 0) {
+    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or"
+                         " there are no constant offsets that can be promoted.\n";);
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr->Base.HiReg << ", "
+                    << MAddr->Base.LoReg << "} Offset: " << MAddr->Offset
+                    << "\n\n";);
+
+  // Find the anchor: the instruction with the same base whose offset is
+  // farthest from MI's offset while the distance still fits in the signed
+  // 13-bit immediate.
+  MachineInstr *AnchorInst = nullptr;
+  MemAddress AnchorAddr;
+  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
+  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
+
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator E = MBB->end();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  ++MBBI;
+
+  for ( ; MBBI != E; ++MBBI) {
+    MachineInstr &MINext = *MBBI;
+    // TODO: Support finding an anchor from a store address.
+    if (MINext.getOpcode() != MI.getOpcode() ||
+        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
+      continue;
+
+    const MachineOperand &BaseNext =
+        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
+    MemAddress *MAddrNext;
+    if (Visited.find(&MINext) == Visited.end()) {
+      MAddrNext = new MemAddress();
+      processBaseWithConstOffset(BaseNext, *MAddrNext);
+      Visited[&MINext] = MAddrNext;
+    } else
+      MAddrNext = Visited[&MINext];
+
+    if (MAddrNext->Base.LoReg != MAddr->Base.LoReg ||
+        MAddrNext->Base.HiReg != MAddr->Base.HiReg ||
+        MAddrNext->Base.LoSubReg != MAddr->Base.LoSubReg ||
+        MAddrNext->Base.HiSubReg != MAddr->Base.HiSubReg)
+      continue;
+
+    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext->Offset));
+
+    int64_t Dist = MAddr->Offset - MAddrNext->Offset;
+    if (isInt<13>(Dist) && (uint32_t)std::abs(Dist) > MaxDist) {
+      MaxDist = std::abs(Dist);
+
+      AnchorAddr = *MAddrNext;
+      AnchorInst = &MINext;
+    }
+  }
+
+  if (AnchorInst) {
+    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
+               AnchorInst->dump());
+    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: " << AnchorAddr.Offset
+                      << "\n\n");
+
+    // Instead of moving up, just re-compute anchor-instruction's base address.
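+    // Materialize the anchor's full 64-bit address right before MI, then
+    // rewrite MI and every other load sharing this base as that new base plus
+    // a small delta, whenever the delta fits the signed 13-bit immediate.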
+    unsigned Base = computeBase(MI, AnchorAddr);
+
+    updateBaseAndOffset(MI, Base, MAddr->Offset - AnchorAddr.Offset);
+    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
+
+    for (auto P : InstsWCommonBase)
+      if (isInt<13>(P.second - AnchorAddr.Offset)) {
+        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
+                   dbgs() << ")"; P.first->dump());
+        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
+        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
+      }
+
+    AnchorList.insert(AnchorInst);
+    return true;
+  }
+
+  return false;
+}
+
 // Scan through looking for adjacent LDS operations with constant offsets from
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
   bool Modified = false;
 
+  // Contains the base and offset info computed for each visited address.
+  MemInfoMap Visited;
+  // Contains the list of instructions for which constant offsets are being
+  // promoted to the IMM.
+  SmallPtrSet<MachineInstr *, 4> AnchorList;
+
   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
     MachineInstr &MI = *I;
 
+    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
+      Modified = true;
+
     // Don't combine if volatile.
     if (MI.hasOrderedMemoryRef()) {
       ++I;
@@ -934,6 +1237,11 @@
     ++I;
   }
 
+  // Release memory.
+  for (MemInfoMap::iterator It = Visited.begin(), E = Visited.end();
+       It != E; ++It)
+    delete It->second;
+
   return Modified;
 }
Index: promote-constOffset-to-imm.ll
===================================================================
--- /dev/null
+++ promote-constOffset-to-imm.ll
@@ -0,0 +1,1288 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+
+declare i64 @_Z13get_global_idj(i32)
+
+define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
+; GFX8-LABEL: clmem_read_simplified:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s38, -1
+; GFX8-NEXT:    s_mov_b32 s33, s3
+; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
+; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX8-NEXT:    s_mov_b32 s4, s33
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    s_getpc_b64 s[6:7]
+; GFX8-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX8-NEXT:    s_mov_b32 s32, s33
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s35
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0x800
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0x1000
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0x1800
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0x2000
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v12, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0x2800
+; GFX8-NEXT:    v_add_u32_e32 v13, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v14, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0x3000
+; GFX8-NEXT:    v_add_u32_e32 v15, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v16, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0x3800
+; GFX8-NEXT:    v_add_u32_e32 v17, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v18, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT:    flat_load_dwordx2 v[13:14], v[13:14]
+; GFX8-NEXT:    flat_load_dwordx2 v[11:12], v[11:12]
+; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
+; GFX8-NEXT:    flat_load_dwordx2 v[17:18], v[17:18]
+; GFX8-NEXT:    flat_load_dwordx2 v[15:16], v[15:16]
+; GFX8-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v11, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v12, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v13, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v14, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v15, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v16, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v17, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v18, v1, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: clmem_read_simplified:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s33, s3
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff8000, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s35
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s34, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v0, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX9-NEXT:    s_movk_i32 s0, 0x2000
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[7:8], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[9:10], v[0:1], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[11:12], v[5:6], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[13:14], v[5:6], off offset:-2048
+; GFX9-NEXT:    global_load_dwordx2 v[15:16], v[5:6], off
+; GFX9-NEXT:    s_movk_i32 s0, 0x3800
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[5:6], off offset:2048
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[17:18], v[0:1], off offset:-2048
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(6)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v9, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v8, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v11, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v12, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v13, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v14, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v15, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v16, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v17, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v18, v5, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], s[34:35]
+; GFX9-NEXT:    s_endpgm
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %a0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
+  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
+  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
+  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
+  %add.1 = add i64 %load2, %load1
+
+  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
+  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
+  %add.2 = add i64 %load3, %add.1
+  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
+  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
+  %add.3 = add i64 %load4, %add.2
+
+  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
+  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
+  %add.4 = add i64 %load5, %add.3
+  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
+  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
+  %add.5 = add i64 %load6, %add.4
+
+  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
+  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
+  %add.6 = add i64 %load7, %add.5
+  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
+  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
+  %add.7 = add i64 %load8, %add.6
+
+  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
+  ret void
+}
+
+define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
+; GFX8-LABEL: clmem_read:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s38, -1
+; GFX8-NEXT:    s_mov_b32 s33, s3
+; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
+; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX8-NEXT:    s_mov_b32 s4, s33
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    s_getpc_b64 s[6:7]
+; GFX8-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX8-NEXT:    s_mov_b32 s32, s33
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 17, v0
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xfe000000, v3
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v3, s35
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s34, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v5, s35
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s34, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0x5000
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_movk_i32 s6, 0x7f
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    s_movk_i32 s7, 0xb800
+; GFX8-NEXT:    s_movk_i32 s8, 0xc000
+; GFX8-NEXT:    s_movk_i32 s9, 0xc800
+; GFX8-NEXT:    s_movk_i32 s10, 0xd000
+; GFX8-NEXT:    s_movk_i32 s11, 0xd800
+; GFX8-NEXT:    s_movk_i32 s12, 0xe000
+; GFX8-NEXT:    s_movk_i32 s13, 0xe800
+; GFX8-NEXT:    s_movk_i32 s14, 0xf000
+; GFX8-NEXT:    s_movk_i32 s15, 0xf800
+
+; GFX8:         v_mov_b32_e32 v7, v3
+; GFX8-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8-NEXT:    s_mov_b32 s16, 0
+; GFX8:         v_add_u32_e32 v8, vcc, 0xffffb000, v6
+; GFX8-NEXT:    s_mov_b64 s[0:1], vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, s7, v6
+; GFX8-NEXT:    s_mov_b64 s[2:3], vcc
+; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s8, v6
+; GFX8-NEXT:    s_mov_b64 s[4:5], vcc
+; GFX8-NEXT:    v_addc_u32_e64 v9, vcc, -1, v7, s[0:1]
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
+; GFX8-NEXT:    v_add_u32_e32 v14, vcc, s9, v6
+; GFX8-NEXT:    v_addc_u32_e32 v15, vcc, -1, v7, vcc
+; GFX8-NEXT:    v_addc_u32_e64 v13, vcc, -1, v7, s[4:5]
+; GFX8-NEXT:    v_addc_u32_e64 v11, vcc, -1, v7, s[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v16, vcc, s10, v6
+; GFX8-NEXT:    flat_load_dwordx2 v[14:15], v[14:15]
+; GFX8-NEXT:    flat_load_dwordx2 v[12:13], v[12:13]
+; GFX8-NEXT:    flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT:    s_mov_b64 s[0:1], vcc
+; GFX8-NEXT:    v_add_u32_e32 v18, vcc, s11, v6
+; GFX8-NEXT:    s_mov_b64 s[2:3], vcc
+; GFX8-NEXT:    v_add_u32_e32 v20, vcc, s12, v6
+; GFX8-NEXT:    v_addc_u32_e32 v21, vcc, -1, v7, vcc
+; GFX8-NEXT:    v_addc_u32_e64 v19, vcc, -1, v7, s[2:3]
+; GFX8-NEXT:    v_addc_u32_e64 v17, vcc, -1, v7, s[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v22, vcc, s13, v6
+; GFX8-NEXT:    v_addc_u32_e32 v23, vcc, -1, v7, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[20:21], v[20:21]
+; GFX8-NEXT:    flat_load_dwordx2 v[18:19], v[18:19]
+; GFX8-NEXT:    flat_load_dwordx2 v[16:17], v[16:17]
+; GFX8-NEXT:    flat_load_dwordx2 v[24:25], v[6:7]
+; GFX8-NEXT:    s_addk_i32 s16, 0x2000
+; GFX8-NEXT:    s_cmp_lt_u32 s16, 0x400000
+; GFX8-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
+; GFX8-NEXT:    v_add_u32_e32 v26, vcc, v8, v4
+; GFX8-NEXT:    v_addc_u32_e32 v27, vcc, v9, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s14, v6
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, -1, v7, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s15, v6
+; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, -1, v7, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[8:9], v[8:9]
+; GFX8-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
+; GFX8-NEXT:    flat_load_dwordx2 v[22:23], v[22:23]
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x10000, v6
+; GFX8-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v26
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v11, v27, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v12, v10
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v13, v11, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v14, v10
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v15, v11, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v16, v10
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v17, v11, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v18, v10
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v19, v11, vcc
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v20, v10
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v21, v11, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v22, v10
+; GFX8-NEXT:    v_addc_u32_e32 v11, vcc, v23, v11, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v10
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v11, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v8, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v24, v4
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v25, v5, vcc
+; GFX8:         s_add_i32 s0, s6, -1
+; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
+; GFX8-NEXT:    s_mov_b32 s6, s0
+; GFX8:         flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: clmem_read:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s33, s3
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
+; GFX9-NEXT:    v_lshlrev_b64 v[3:4], 3, v[1:2]
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xfe000000, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, v0, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, s35
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s34, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    s_movk_i32 s0, 0x5000
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    s_movk_i32 s2, 0x7f
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    s_movk_i32 s3, 0xc800
+; GFX9-NEXT:    s_movk_i32 s4, 0xe800
+; GFX9:         v_mov_b32_e32 v7, v3
+; GFX9-NEXT:    v_mov_b32_e32 v6, v2
+; GFX9-NEXT:    s_mov_b32 s5, 0
+; GFX9:         v_add_co_u32_e32 v8, vcc, 0xffffb000, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v7, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v14, vcc, s3, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v7, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
+; GFX9-NEXT:    global_load_dwordx2 v[20:21], v[14:15], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[22:23], v[14:15], off offset:-2048
+; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], s4, v6
+; GFX9-NEXT:    global_load_dwordx2 v[24:25], v[14:15], off
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], -1, v7, s[0:1]
+; GFX9-NEXT:    global_load_dwordx2 v[16:17], v[10:11], off offset:-2048
+; GFX9-NEXT:    global_load_dwordx2 v[18:19], v[10:11], off
+; GFX9-NEXT:    global_load_dwordx2 v[14:15], v[14:15], off offset:2048
+; GFX9-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[12:13], v[6:7], off offset:-2048
+; GFX9-NEXT:    s_addk_i32 s5, 0x2000
+; GFX9-NEXT:    s_cmp_lt_u32 s5, 0x400000
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_add_co_u32_e32 v26, vcc, v8, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v27, vcc, v9, v5, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[4:5], v[6:7], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[8:9], v[6:7], off
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 0x10000, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(9)
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v20, v26
+; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, v21, v27, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v22, v20
+; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, v23, v21, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    v_add_co_u32_e32 v20, vcc, v24, v20
+; GFX9-NEXT:    v_addc_co_u32_e32 v21, vcc, v25, v21, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_add_co_u32_e32 v14, vcc, v14, v20
+; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v21, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v14
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v15, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v16, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v17, v11, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v18, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v19, v11, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v11, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v13, v5, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX9:         s_add_i32 s0, s2, -1
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX9-NEXT:    s_mov_b32 s2, s0
+; GFX9:         global_store_dwordx2 v[0:1], v[4:5], s[34:35]
+; GFX9-NEXT:    s_endpgm
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 17
+  %idx.ext11 = and i64 %a0, 4261412864
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
+  br label %for.cond.preheader
+
+while.cond.loopexit:                              ; preds = %for.body
+  %dec = add nsw i32 %dec31, -1
+  %tobool = icmp eq i32 %dec31, 0
+  br i1 %tobool, label %while.end, label %for.cond.preheader
+
+for.cond.preheader:                               ; preds = %entry, %while.cond.loopexit
+  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
+  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.cond.preheader
+  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
+  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
+  %conv3 = zext i32 %block.029 to i64
+  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
+  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
+  %add = add i64 %load1, %sum.128
+
+  %add9 = or i32 %block.029, 256
+  %conv3.1 = zext i32 %add9 to i64
+  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
+  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
+  %add.1 = add i64 %load2, %add
+
+  %add9.1 = or i32 %block.029, 512
+  %conv3.2 = zext i32 %add9.1 to i64
+  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
+  %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
+  %add.2 = add i64 %l3, %add.1
+
+  %add9.2 = or i32 %block.029, 768
+  %conv3.3 = zext i32 %add9.2 to i64
+  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
+  %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
+  %add.3 = add i64 %l4, %add.2
+
+  %add9.3 = or i32 %block.029, 1024
+  %conv3.4 = zext i32 %add9.3 to i64
+  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
+  %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
+  %add.4 = add i64 %l5, %add.3
+
+  %add9.4 = or i32 %block.029, 1280
+  %conv3.5 = zext i32 %add9.4 to i64
+  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
+  %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
+  %add.5 = add i64 %l6, %add.4
+
+  %add9.5 = or i32 %block.029, 1536
+  %conv3.6 = zext i32 %add9.5 to i64
+  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
+  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
+  %add.6 = add i64 %load7, %add.5
+
+  %add9.6 = or i32 %block.029, 1792
+  %conv3.7 = zext i32 %add9.6 to i64
+  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
+  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
+  %add.7 = add i64 %load8, %add.6
+
+  %add9.7 = or i32 %block.029, 2048
+  %conv3.8 = zext i32 %add9.7 to i64
+  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
+  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
+  %add.8 = add i64 %load9, %add.7
+
+  %add9.8 = or i32 %block.029, 2304
+  %conv3.9 = zext i32 %add9.8 to i64
+  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
+  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
+  %add.9 = add i64 %load10, %add.8
+
+  %add9.9 = or i32 %block.029, 2560
+  %conv3.10 = zext i32 %add9.9 to i64
+  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
+  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
+  %add.10 = add i64 %load11, %add.9
+
+  %add9.31 = add nuw nsw i32 %block.029, 8192
+  %cmp.31 = icmp ult i32 %add9.31, 4194304
+  br i1 %cmp.31, label %for.body, label %while.cond.loopexit
+
+while.end:                                        ; preds = %while.cond.loopexit
+  store i64 %add.10, i64 addrspace(1)* %a1, align 8
+  ret void
+}
+
+; Using a 32-bit address.
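+; The ten dword loads below step by 0x400 bytes from a common base, so once an
+; anchor base is materialized most of them should fold their distance into the
+; signed 13-bit immediate offset field.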
+define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
+; GFX8-LABEL: Address32:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s38, -1
+; GFX8-NEXT:    s_mov_b32 s33, s3
+; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
+; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX8-NEXT:    s_mov_b32 s4, s33
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    s_getpc_b64 s[6:7]
+; GFX8-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX8-NEXT:    s_mov_b32 s32, s33
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s35
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x400, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v2, v[0:1]
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0xc00, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x1000, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x1400, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x1800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x1c00, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x2000, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x2400, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: Address32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s33, s3
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff8000, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s35
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s34, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v0, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[1:2]
+; GFX9-NEXT:    s_movk_i32 s0, 0x2000
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-NEXT:    global_load_dword v2, v[0:1], off
+; GFX9-NEXT:    global_load_dword v9, v[0:1], off offset:1024
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 0x2400, v0
+; GFX9-NEXT:    global_load_dword v10, v[0:1], off offset:2048
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off offset:3072
+; GFX9-NEXT:    global_load_dword v1, v[7:8], off
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_add_u32_e32 v2, v9, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_add_u32_e32 v2, v10, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NEXT:    global_load_dword v2, v[5:6], off offset:-4096
+; GFX9-NEXT:    global_load_dword v7, v[5:6], off offset:-3072
+; GFX9-NEXT:    global_load_dword v8, v[5:6], off offset:-2048
+; GFX9-NEXT:    global_load_dword v9, v[5:6], off offset:-1024
+; GFX9-NEXT:    global_load_dword v5, v[5:6], off
+; GFX9-NEXT:    s_waitcnt vmcnt(4)
+; GFX9-NEXT:    v_add_u32_e32 v0, v2, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(3)
+; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_add_u32_e32 v0, v8, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_add_u32_e32 v0, v9, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, v5, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
+; GFX9-NEXT:    global_store_dword v[3:4], v0, s[34:35]
+; GFX9-NEXT:    s_endpgm
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %id = shl i64 %call, 7
+  %idx.ext11 = and i64 %id, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*
+
+  %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
+  %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4
+
+  %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
+  %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
+  %add.1 = add i32 %load2, %load1
+
+  %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
+  %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
+  %add.2 = add i32 %load3, %add.1
+
+  %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
+  %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
+  %add.3 = add i32 %load4, %add.2
+
+  %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
+  %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
+  %add.4 = add i32 %load5, %add.3
+
+  %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
+  %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
+  %add.5 = add i32 %load6, %add.4
+
+  %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
+  %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
+  %add.6 = add i32 %load7, %add.5
+
+  %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
+  %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
+  %add.7 = add i32 %load8, %add.6
+
+  %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
+  %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
+  %add.8 = add i32 %load9, %add.7
+
+  %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
+  %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
+  %add.9 = add i32 %load10, %add.8
+
+  store i32 %add.9, i32 addrspace(1)* %addr, align 4
+  ret void
+}
+
+define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
+; GFX8-LABEL: Offset64:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b32 s38, -1
+; GFX8-NEXT:    s_mov_b32 s33, s3
+; GFX8-NEXT:    s_mov_b32 s39, 0xe80000
+; GFX8-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX8-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX8-NEXT:    s_mov_b32 s4, s33
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    s_getpc_b64 s[6:7]
+; GFX8-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX8-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX8-NEXT:    s_mov_b32 s32, s33
+; GFX8-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s35
+; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0xf000
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_movk_i32 s0, 0xf800
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0, v0
+; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 1, v1, vcc
+; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT:    flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT:    flat_load_dwordx2 v[9:10], v[9:10]
+; GFX8-NEXT:    flat_load_dwordx2 v[7:8], v[7:8]
+; GFX8-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v5, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v9, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: Offset64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s33, s3
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_getpc_b64 s[6:7]
+; GFX9-NEXT:    s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff8000, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s35
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, s34, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v0, vcc
+; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v4, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, 0, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 1, v1, vcc
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    global_load_dwordx2 v[7:8], v[5:6], off offset:-4096
+; GFX9-NEXT:    global_load_dwordx2 v[9:10], v[5:6], off offset:-2048
+; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[5:6], off
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v7, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v9, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v10, v1, vcc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v1, vcc
+; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], s[34:35]
+; GFX9-NEXT:    s_endpgm
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %a0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
+  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
+
+  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
+  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
+
+  %add1 = add i64 %load2, %load1
+
+  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
+  %load3 = load i64, i64 addrspace(1)* %addr3, align 8
+
+  %add2 = add i64 %load3, %add1
+
+  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
+  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
+  %add4 = add i64 %load4, %add2
+
+  store i64 %add4, i64 addrspace(1)* %saddr, align 8
+  ret void
+}
+
+; TODO: Support load4 as anchor instruction.
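+; In the GFX9 output below, load2 and load3 are folded onto a re-computed
+; base, while load4, which would be the natural anchor for all three, still
+; keeps its own 64-bit address computation.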
+define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) { +; GFX8-LABEL: p32Offset64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s38, -1 +; GFX8-NEXT: s_mov_b32 s33, s3 +; GFX8-NEXT: s_mov_b32 s39, 0xe80000 +; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX8-NEXT: s_mov_b32 s4, s33 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_getpc_b64 s[6:7] +; GFX8-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4 +; GFX8-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4 +; GFX8-NEXT: s_mov_b32 s32, s33 +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7ffff800, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7ffffc00, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80000000, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: p32Offset64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s33, s3 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_getpc_b64 s[6:7] +; GFX9-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff8000, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s35 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s34, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v0, vcc +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] +; GFX9-NEXT: s_mov_b32 s0, 0x7ffffc00 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: 
global_load_dword v1, v[7:8], off +; GFX9-NEXT: global_load_dword v2, v[5:6], off offset:-1024 +; GFX9-NEXT: global_load_dword v5, v[5:6], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, v2, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, v5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 +; GFX9-NEXT: global_store_dword v[3:4], v0, s[34:35] +; GFX9-NEXT: s_endpgm +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)* + + %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv + %load1 = load i32, i32 addrspace(1)* %addr1, align 8 + + %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400 + %load2 = load i32, i32 addrspace(1)* %addr2, align 8 + + %add1 = add i32 %load2, %load1 + + %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656 + %load3 = load i32, i32 addrspace(1)* %addr3, align 8 + + %add2 = add i32 %load3, %add1 + + %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912 + %load4 = load i32, i32 addrspace(1)* %addr4, align 8 + %add4 = add i32 %load4, %add2 + + store i32 %add4, i32 addrspace(1)* %saddr, align 8 + ret void +} + +define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, +; GFX8-LABEL: DiffBase: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX8-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX8-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s42, -1 +; GFX8-NEXT: s_mov_b32 s33, s3 +; GFX8-NEXT: s_mov_b32 s43, 0xe80000 +; GFX8-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX8-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX8-NEXT: s_mov_b32 s4, s33 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_getpc_b64 s[6:7] +; GFX8-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4 +; GFX8-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4 +; GFX8-NEXT: s_mov_b32 s32, s33 +; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s36, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s39 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s38, v2 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x1000 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x1800 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x2000 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x2800 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s0, v12 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x3000 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s0, v12 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x3800 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s0, v12 +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] +; GFX8-NEXT: 
flat_load_dwordx2 v[10:11], v[10:11]
+; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
+; GFX8-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v8
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v7, v3, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: DiffBase:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s42, -1
+; GFX9-NEXT: s_mov_b32 s33, s3
+; GFX9-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT: s_mov_b32 s4, s33
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_getpc_b64 s[6:7]
+; GFX9-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s37
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s36, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s39
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, s38, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v2, vcc
+; GFX9-NEXT: s_movk_i32 s0, 0x2000
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v3
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT: s_movk_i32 s0, 0x3800
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:-4096
+; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, s0, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: global_load_dwordx2 v[10:11], v[8:9], off offset:-4096
+; GFX9-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
+; GFX9-NEXT: global_load_dwordx2 v[8:9], v[8:9], off
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v5, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v11, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], s[36:37]
+; GFX9-NEXT: s_endpgm
+  i8 addrspace(1)* %buffer2) {
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %a0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
+  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
+  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*
+
+  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
+  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
+  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
+  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
+  %add1 = add i64 %load2, %load1
+  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
+  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
+  %add2 = add i64 %load3, %add1
+
+  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
+  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
+
+  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
+  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
+  %add3 = add i64 %load5, %load4
+
+  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
+  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
+  %add4 = add i64 %load6, %add3
+
+  %add5 = add i64 %add2, %add4
+
+  store i64 %add5, i64 addrspace(1)* %saddr, align 8
+  ret void
+}
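(Reviewer note on the checks above: each i64 GEP index is scaled by 8 bytes, and the GFX9 output re-computes one anchor per base, 0x2000 (1024*8) for %saddr and 0x3800 (1792*8) for %saddr2, folding the remaining loads into negative immediates. A standalone C++ sketch, not part of the patch, that checks the deltas visible in the output:

  // Sketch: verify the immediate offsets printed in the DiffBase GFX9 checks.
  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t Anchor1 = 1024 * 8;     // 0x2000, anchor for %saddr
    assert(512 * 8 - Anchor1 == -4096);   // global_load ... off offset:-4096
    assert(768 * 8 - Anchor1 == -2048);   // global_load ... off offset:-2048
    const int64_t Anchor2 = 1792 * 8;     // 0x3800, anchor for %saddr2
    assert(1280 * 8 - Anchor2 == -4096);  // global_load ... off offset:-4096
    assert(1536 * 8 - Anchor2 == -2048);  // global_load ... off offset:-2048
    return 0;
  }

The two bases stay independent: no cross-base offset is folded, which is exactly what this test is exercising.)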
+
+define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
+; GFX8-LABEL: ReverseOrder:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s38, -1
+; GFX8-NEXT: s_mov_b32 s33, s3
+; GFX8-NEXT: s_mov_b32 s39, 0xe80000
+; GFX8-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX8-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX8-NEXT: s_mov_b32 s4, s33
+; GFX8-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-NEXT: s_getpc_b64 s[6:7]
+; GFX8-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX8-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX8-NEXT: s_mov_b32 s32, s33
+; GFX8-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s35
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: s_movk_i32 s0, 0x3800
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: s_movk_i32 s0, 0x3000
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; GFX8-NEXT: s_movk_i32 s0, 0x2800
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc
+; GFX8-NEXT: s_movk_i32 s0, 0x2000
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc
+; GFX8-NEXT: s_movk_i32 s0, 0x1800
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc
+; GFX8-NEXT: s_movk_i32 s0, 0x1000
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc
+; GFX8-NEXT: s_movk_i32 s0, 0x800
+; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0
+; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8]
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6]
+; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14]
+; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12]
+; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10]
+; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18]
+; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16]
+; GFX8-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v11, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v12, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v13, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v14, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v15, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v16, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v17, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v18, v1, vcc
+; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: ReverseOrder:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s33, s3
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s4, s33
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_getpc_b64 s[6:7]
+; GFX9-NEXT: s_add_u32 s6, s6, _Z13get_global_idj@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s7, s7, _Z13get_global_idj@rel32@hi+4
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 7, v0
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff8000, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s35
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s34, v3
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v0, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2]
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dwordx2 v[7:8], v[5:6], off offset:-2048
+; GFX9-NEXT: global_load_dwordx2 v[9:10], v[5:6], off
+; GFX9-NEXT: global_load_dwordx2 v[11:12], v[5:6], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[13:14], v[0:1], off
+; GFX9-NEXT: s_movk_i32 s0, 0x1000
+; GFX9-NEXT: global_load_dwordx2 v[15:16], v[0:1], off offset:2048
+; GFX9-NEXT: global_load_dwordx2 v[5:6], v[5:6], off offset:-4096
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_load_dwordx2 v[17:18], v[0:1], off
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:2048
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v11, v13
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v14, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v10, v11, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v9, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v17, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v18, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v15, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v16, v1, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], s[34:35]
+; GFX9-NEXT: s_endpgm
+entry:
+  %call = tail call i64 @_Z13get_global_idj(i32 0)
+  %conv = and i64 %call, 255
+  %a0 = shl i64 %call, 7
+  %idx.ext11 = and i64 %a0, 4294934528
+  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
+  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
+  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
+
+  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
+  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
+  %add7 = add i64 %load8, %load1
+
+  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
+  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
+  %add6 = add i64 %load7, %add7
+
+  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
+  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
+  %add5 = add i64 %load6, %add6
+
+  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
+  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
+  %add4 = add i64 %load5, %add5
+
+  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
+  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
+  %add3 = add i64 %load4, %add4
+
+  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
+  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
+  %add2 = add i64 %load3, %add3
+
+  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
+  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
+  %add1 = add i64 %load2, %add2
+
+  store i64 %add1, i64 addrspace(1)* %saddr, align 8
+  ret void
+}
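(Reviewer note: ReverseOrder walks the same stride-8 offsets in descending source order, so it checks that the rewrite does not depend on instruction order. GFX8 still materializes seven separate adds (0x3800 down to 0x800), while GFX9 keeps the original base plus two re-computed anchors at 0x3000 and 0x1000. Two anchors are needed because the GFX9 global memory immediate is a signed 13-bit field, so only addresses within [-4096, 4095] bytes of an anchor can be folded. A standalone C++ sketch, not part of the patch, of that reachability constraint:

  // Sketch: GFX9 global_load immediate offsets are signed 13-bit.
  #include <cassert>
  #include <cstdint>

  bool reachable(int64_t Addr, int64_t Anchor) {
    const int64_t D = Addr - Anchor;
    return D >= -4096 && D <= 4095;  // signed 13-bit immediate range
  }

  int main() {
    assert(reachable(0x2000, 0x3000));   // off offset:-4096
    assert(reachable(0x3800, 0x3000));   // off offset:2048
    assert(!reachable(0x0800, 0x3000));  // too far; needs the 0x1000 anchor
    assert(reachable(0x1800, 0x1000));   // off offset:2048
    return 0;
  }

The RUN lines for this .ll test sit above the excerpted hunk and are not shown here.)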
Index: promote-constOffset-to-imm.mir
===================================================================
--- /dev/null
+++ promote-constOffset-to-imm.mir
@@ -0,0 +1,125 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt,dead-mi-elimination -o - %s | FileCheck -check-prefix=GFX9 %s
+
+--- |
+  declare i64 @_Z13get_global_idj(i32)
+  declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+
+  define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* nocapture %buffer) {
+  entry:
+    %clmem_read_simplified.kernarg.segment = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+    %buffer.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %clmem_read_simplified.kernarg.segment, i64 36
+    %buffer.kernarg.offset.cast = bitcast i8 addrspace(4)* %buffer.kernarg.offset to i8 addrspace(1)* addrspace(4)*
+    %buffer.load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* %buffer.kernarg.offset.cast, align 4
+    %call = tail call i64 @_Z13get_global_idj(i32 0)
+    %conv = and i64 %call, 255
+    %0 = shl i64 %call, 7
+    %idx.ext11 = and i64 %0, 4294934528
+    %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer.load, i64 %idx.ext11
+    %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
+
+    %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
+    %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
+    %load1 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
+
+    %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
+    %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
+    %add = add i64 %load1, %load2
+
+    store i64 %add, i64 addrspace(1)* %saddr, align 8
+    ret void
+  }
+
+...
+---
+
+name: clmem_read_simplified
+body: |
+  bb.0.entry:
+    liveins: $sgpr0_sgpr1
+    ; GFX9-LABEL: name: clmem_read_simplified
+    ; GFX9: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1
+    ; GFX9: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0, 0
+    ; GFX9: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr101
+    ; GFX9: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 4, implicit-def dead $scc
+    ; GFX9: [[COPY1:%[0-9]+]]:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
+    ; GFX9: [[COPY2:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr101
+    ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
+    ; GFX9: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]]
+    ; GFX9: $sgpr4 = COPY [[COPY2]]
+    ; GFX9: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @_Z13get_global_idj, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit-def $vgpr0_vgpr1
+    ; GFX9: ADJCALLSTACKDOWN 0, 4, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr101
+    ; GFX9: [[COPY3:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 255, [[COPY3]].sub0, implicit $exec
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_AND_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GFX9: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 7, [[COPY3]].sub0, implicit $exec
+    ; GFX9: [[V_AND_B32_e32_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 -32768, killed [[V_LSHLREV_B32_e64_]], implicit $exec
+    ; GFX9: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e32_1]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; GFX9: [[COPY4:%[0-9]+]]:sgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+    ; GFX9: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+    ; GFX9: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[V_AND_B32_e32_1]], implicit $exec
+    ; GFX9: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]]
+    ; GFX9: %18:vgpr_32, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[COPY6]], [[COPY5]], killed [[V_ADD_I32_e64_1]], implicit $exec
+    ; GFX9: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 3, [[REG_SEQUENCE]], implicit $exec
+    ; GFX9: [[V_ADD_I32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[V_ADD_I32_e64_]], [[V_LSHLREV_B64_]].sub0, implicit $exec
+    ; GFX9: %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %18, [[V_LSHLREV_B64_]].sub1, killed [[V_ADD_I32_e64_3]], implicit $exec
+    ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 6144
+    ; GFX9: [[V_ADD_I32_e64_4:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[V_ADD_I32_e64_2]], [[S_MOV_B32_1]], implicit $exec
+    ; GFX9: %48:vgpr_32, dead %46:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed [[V_ADD_I32_e64_5]], implicit $exec
+    ; GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_4]], %subreg.sub0, %48, %subreg.sub1
+    ; GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -2048, 0, 0, 0, implicit $exec :: (load 8 from %ir.add.ptr8.2, addrspace 1)
+    ; GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.add.ptr8.3, addrspace 1)
+    ; GFX9: [[V_ADD_I32_e64_6:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_7:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 [[GLOBAL_LOAD_DWORDX2_1]].sub0, [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec
+    ; GFX9: %41:vgpr_32, dead %42:sreg_64_xexec = V_ADDC_U32_e64 [[GLOBAL_LOAD_DWORDX2_1]].sub1, [[GLOBAL_LOAD_DWORDX2_]].sub1, killed [[V_ADD_I32_e64_7]], implicit $exec
+    ; GFX9: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_6]], %subreg.sub0, %41, %subreg.sub1
+    ; GFX9: GLOBAL_STORE_DWORDX2_SADDR [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], [[S_LOAD_DWORDX2_IMM]], 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8 into %ir.saddr, addrspace 1)
+    ; GFX9: S_ENDPGM
+    %1:sgpr_64 = COPY $sgpr0_sgpr1
+    %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0, 0
+    ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr101
+    %5:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @_Z13get_global_idj + 4, target-flags(amdgpu-rel32-hi) @_Z13get_global_idj + 4, implicit-def dead $scc
+    %6:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
+    %7:sreg_32_xm0 = COPY $sgpr101
+    %8:sreg_32_xm0 = S_MOV_B32 0
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %6:sreg_128
+    $sgpr4 = COPY %7:sreg_32_xm0
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $sgpr30_sgpr31 = SI_CALL killed %5:sreg_64, @_Z13get_global_idj, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit-def $vgpr0_vgpr1
+    ADJCALLSTACKDOWN 0, 4, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr101
+
+    %44:vreg_64 = COPY $vgpr0_vgpr1
+    %12:vgpr_32 = V_AND_B32_e32 255, %44.sub0:vreg_64, implicit $exec
+    %47:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %46:vreg_64 = REG_SEQUENCE killed %12:vgpr_32, %subreg.sub0, %47:vgpr_32, %subreg.sub1
+    %16:vgpr_32 = V_LSHLREV_B32_e64 7, %44.sub0:vreg_64, implicit $exec
+    %19:vgpr_32 = V_AND_B32_e32 -32768, killed %16:vgpr_32, implicit $exec
+    %48:vreg_64 = REG_SEQUENCE %19:vgpr_32, %subreg.sub0, %47:vgpr_32, %subreg.sub1
+    %57:sgpr_32 = COPY %4.sub1:sreg_64_xexec
+    %58:vgpr_32 = COPY %8:sreg_32_xm0
+    %51:vgpr_32, %53:sreg_64_xexec = V_ADD_I32_e64 %4.sub0:sreg_64_xexec, %19:vgpr_32, implicit $exec
+    %59:vgpr_32 = COPY %57:sgpr_32
+    %52:vgpr_32, dead %54:sreg_64_xexec = V_ADDC_U32_e64 %59:vgpr_32, %58:vgpr_32, killed %53:sreg_64_xexec, implicit $exec
+    %24:vreg_64 = V_LSHLREV_B64 3, %46:vreg_64, implicit $exec
+    %61:vgpr_32, %63:sreg_64_xexec = V_ADD_I32_e64 %51:vgpr_32, %24.sub0:vreg_64, implicit $exec
+    %62:vgpr_32, dead %64:sreg_64_xexec = V_ADDC_U32_e64 %52:vgpr_32, %24.sub1:vreg_64, killed %63:sreg_64_xexec, implicit $exec
+
+    %86:sgpr_32 = S_MOV_B32 4096
+    %81:vgpr_32, %83:sreg_64_xexec = V_ADD_I32_e64 %86:sgpr_32, %61:vgpr_32, implicit $exec
+    %82:vgpr_32, dead %84:sreg_64_xexec = V_ADDC_U32_e64 %62:vgpr_32, 0, killed %83:sreg_64_xexec, implicit $exec
+    %80:vreg_64 = REG_SEQUENCE %81:vgpr_32, %subreg.sub0, %82:vgpr_32, %subreg.sub1
+    %32:vreg_64 = GLOBAL_LOAD_DWORDX2 %80:vreg_64, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.add.ptr8.2, addrspace 1)
+    %76:sgpr_32 = S_MOV_B32 6144
+
+    %71:vgpr_32, %73:sreg_64_xexec = V_ADD_I32_e64 %61:vgpr_32, %76:sgpr_32, implicit $exec
+    %72:vgpr_32, dead %74:sreg_64_xexec = V_ADDC_U32_e64 %62:vgpr_32, 0, killed %73:sreg_64_xexec, implicit $exec
+    %70:vreg_64 = REG_SEQUENCE %71:vgpr_32, %subreg.sub0, %72:vgpr_32, %subreg.sub1
+    %37:vreg_64 = GLOBAL_LOAD_DWORDX2 %70:vreg_64, 0, 0, 0, 0, implicit $exec :: (load 8 from %ir.add.ptr8.3, addrspace 1)
+
+    %93:vgpr_32, %95:sreg_64_xexec = V_ADD_I32_e64 %37.sub0:vreg_64, %32.sub0:vreg_64, implicit $exec
+    %94:vgpr_32, dead %96:sreg_64_xexec = V_ADDC_U32_e64 %37.sub1:vreg_64, %32.sub1:vreg_64, killed %95:sreg_64_xexec, implicit $exec
+    %92:vreg_64 = REG_SEQUENCE %93:vgpr_32, %subreg.sub0, %94:vgpr_32, %subreg.sub1
+    GLOBAL_STORE_DWORDX2_SADDR %48:vreg_64, %92:vreg_64, %4:sreg_64_xexec, 0, 0, 0, 0, implicit $exec, implicit $exec :: (store 8 into %ir.saddr, addrspace 1)
+    S_ENDPGM
+...
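(Reviewer note: per the autogenerated-assertions header, the GFX9 CHECK blocks in this .mir test can be refreshed after changing the pass by re-running the script named in the NOTE line, typically as

  utils/update_mir_test_checks.py promote-constOffset-to-imm.mir

run from the llvm source tree; the exact working directory and any --llc-binary override are assumptions, since the script reads the RUN line from the file itself.)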