Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
@@ -170,7 +170,6 @@
   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
-  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
   unsigned isCFIntrinsic(const SDNode *Intr) const;
@@ -212,6 +211,7 @@
                             SmallVectorImpl &/*Ops*/,
                             Type *&/*AccessTy*/) const override;
 
+  bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
                              Type *Ty, unsigned AS,
                              Instruction *I = nullptr) const override;
Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -20,6 +20,26 @@
 // ==>
 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
 //
+// This pass also tries to promote constant offset to the immediate by
+// adjusting the base. It tries to use a base from the nearby instructions that
+// allows it to have a 13bit constant offset and then promotes the 13bit offset
+// to the immediate.
+// E.g.
+// s_movk_i32 s0, 0x1800
+// v_add_co_u32_e32 v0, vcc, s0, v2
+// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+//
+// s_movk_i32 s0, 0x1000
+// v_add_co_u32_e32 v5, vcc, s0, v2
+// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+// global_load_dwordx2 v[5:6], v[5:6], off
+// global_load_dwordx2 v[0:1], v[0:1], off
+// =>
+// s_movk_i32 s0, 0x1000
+// v_add_co_u32_e32 v5, vcc, s0, v2
+// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+// global_load_dwordx2 v[5:6], v[5:6], off
+// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
 //
 // Future improvements:
 //
@@ -116,6 +136,21 @@
     SmallVector InstsToMove;
   };
 
+  struct BaseRegisters {
+    unsigned LoReg = 0;
+    unsigned HiReg = 0;
+
+    unsigned LoSubReg = 0;
+    unsigned HiSubReg = 0;
+  };
+
+  struct MemAddress {
+    BaseRegisters Base;
+    int64_t Offset = 0;
+  };
+
+  using MemInfoMap = DenseMap;
+
 private:
   const GCNSubtarget *STM = nullptr;
   const SIInstrInfo *TII = nullptr;
@@ -146,6 +181,19 @@
   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
 
+  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
+                           int32_t NewOffset);
+  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
+  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
+  Optional extractConstOffset(const MachineOperand &Op);
+  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
+  /// Promotes constant offset to the immediate by adjusting the base. It
+  /// tries to use a base from the nearby instructions that allows it to have
+  /// a 13bit constant offset which gets promoted to the immediate.
+ bool promoteConstantOffsetToImm(MachineInstr &CI, + MemInfoMap &Visited, + SmallPtrSet &Promoted); + public: static char ID; @@ -1053,15 +1101,328 @@ return Next; } +MachineOperand +SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { + APInt V(32, Val, true); + if (TII->isInlineConstant(V)) + return MachineOperand::CreateImm(Val); + + unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + MachineInstr *Mov = + BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_MOV_B32), Reg) + .addImm(Val); + LLVM_DEBUG(dbgs() << " "; Mov->dump()); + return MachineOperand::CreateReg(Reg, false); +} + +// Compute base address using Addr and return the final register. +unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, + const MemAddress &Addr) { + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + DebugLoc DL = MI.getDebugLoc(); + + assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || + Addr.Base.LoSubReg) && + "Expected 32-bit Base-Register-Low!!"); + + assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || + Addr.Base.HiSubReg) && + "Expected 32-bit Base-Register-Hi!!"); + + LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); + MachineOperand OffsetLo = createRegOrImm(static_cast(Addr.Offset), MI); + MachineOperand OffsetHi = + createRegOrImm(static_cast(Addr.Offset >> 32), MI); + unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned DeadCarryReg = + MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineInstr *LoHalf = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) + .addReg(CarryReg, RegState::Define) + .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) + .add(OffsetLo); + LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); + + MachineInstr *HiHalf = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) + .addReg(DeadCarryReg, RegState::Define | RegState::Dead) + .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) + .add(OffsetHi) + .addReg(CarryReg, RegState::Kill); + LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); + + unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + MachineInstr *FullBase = + BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); + + return FullDestReg; +} + +// Update base and offset with the NewBase and NewOffset in MI. 
+void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, + unsigned NewBase, + int32_t NewOffset) { + TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase); + TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); +} + +Optional +SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) { + if (Op.isImm()) + return Op.getImm(); + + if (!Op.isReg()) + return None; + + MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); + if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || + !Def->getOperand(1).isImm()) + return None; + + return Def->getOperand(1).getImm(); +} + +// Analyze Base and extracts: +// - 32bit base registers, subregisters +// - 64bit constant offset +// Expecting base computation as: +// %OFFSET0:sgpr_32 = S_MOV_B32 8000 +// %LO:vgpr_32, %c:sreg_64_xexec = +// V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, +// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec +// %Base:vreg_64 = +// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 +void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, + MemAddress &Addr) { + if (!Base.isReg()) + return; + + MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); + if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE + || Def->getNumOperands() != 5) + return; + + MachineOperand BaseLo = Def->getOperand(1); + MachineOperand BaseHi = Def->getOperand(3); + if (!BaseLo.isReg() || !BaseHi.isReg()) + return; + + MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); + MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); + + if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 || + !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) + return; + + const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); + const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); + + auto Offset0P = extractConstOffset(*Src0); + if (Offset0P) + BaseLo = *Src1; + else { + if (!(Offset0P = extractConstOffset(*Src1))) + return; + BaseLo = *Src0; + } + + Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); + Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); + + if (Src0->isImm()) + std::swap(Src0, Src1); + + if (!Src1->isImm()) + return; + + assert(isInt<32>(*Offset0P) && isInt<32>(Src1->getImm()) + && "Expected 32bit immediate!!!"); + uint64_t Offset1 = Src1->getImm(); + BaseHi = *Src0; + + Addr.Base.LoReg = BaseLo.getReg(); + Addr.Base.HiReg = BaseHi.getReg(); + Addr.Base.LoSubReg = BaseLo.getSubReg(); + Addr.Base.HiSubReg = BaseHi.getSubReg(); + Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); +} + +bool SILoadStoreOptimizer::promoteConstantOffsetToImm( + MachineInstr &MI, + MemInfoMap &Visited, + SmallPtrSet &AnchorList) { + + // TODO: Support flat and scratch. + if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 || + TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) + return false; + + // TODO: Support Store. + if (!MI.mayLoad()) + return false; + + if (AnchorList.count(&MI)) + return false; + + LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump()); + + if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) { + LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";); + return false; + } + + // Step1: Find the base-registers and a 64bit constant offset. 
+ MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + MemAddress MAddr; + if (Visited.find(&MI) == Visited.end()) { + processBaseWithConstOffset(Base, MAddr); + Visited[&MI] = MAddr; + } else + MAddr = Visited[&MI]; + + if (MAddr.Offset == 0) { + LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no" + " constant offsets that can be promoted.\n";); + return false; + } + + LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", " + << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";); + + // Step2: Traverse through MI's basic block and find an anchor(that has the + // same base-registers) with the highest 13bit distance from MI's offset. + // E.g. (64bit loads) + // bb: + // addr1 = &a + 4096; load1 = load(addr1, 0) + // addr2 = &a + 6144; load2 = load(addr2, 0) + // addr3 = &a + 8192; load3 = load(addr3, 0) + // addr4 = &a + 10240; load4 = load(addr4, 0) + // addr5 = &a + 12288; load5 = load(addr5, 0) + // + // Starting from the first load, the optimization will try to find a new base + // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 + // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 + // as the new-base(anchor) because of the maximum distance which can + // accomodate more intermediate bases presumeably. + // + // Step3: move (&a + 8192) above load1. Compute and promote offsets from + // (&a + 8192) for load1, load2, load4. + // addr = &a + 8192 + // load1 = load(addr, -4096) + // load2 = load(addr, -2048) + // load3 = load(addr, 0) + // load4 = load(addr, 2048) + // addr5 = &a + 12288; load5 = load(addr5, 0) + // + MachineInstr *AnchorInst = nullptr; + MemAddress AnchorAddr; + uint32_t MaxDist = std::numeric_limits::min(); + SmallVector, 4> InstsWCommonBase; + + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator E = MBB->end(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + ++MBBI; + const SITargetLowering *TLI = + static_cast(STM->getTargetLowering()); + + for ( ; MBBI != E; ++MBBI) { + MachineInstr &MINext = *MBBI; + // TODO: Support finding an anchor(with same base) from store addresses or + // any other load addresses where the opcodes are different. + if (MINext.getOpcode() != MI.getOpcode() || + TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) + continue; + + const MachineOperand &BaseNext = + *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); + MemAddress MAddrNext; + if (Visited.find(&MINext) == Visited.end()) { + processBaseWithConstOffset(BaseNext, MAddrNext); + Visited[&MINext] = MAddrNext; + } else + MAddrNext = Visited[&MINext]; + + if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || + MAddrNext.Base.HiReg != MAddr.Base.HiReg || + MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || + MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) + continue; + + InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset)); + + int64_t Dist = MAddr.Offset - MAddrNext.Offset; + TargetLoweringBase::AddrMode AM; + AM.HasBaseReg = true; + AM.BaseOffs = Dist; + if (TLI->isLegalGlobalAddressingMode(AM) && + (uint32_t)abs(Dist) > MaxDist) { + MaxDist = abs(Dist); + + AnchorAddr = MAddrNext; + AnchorInst = &MINext; + } + } + + if (AnchorInst) { + LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; + AnchorInst->dump()); + LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " + << AnchorAddr.Offset << "\n\n"); + + // Instead of moving up, just re-compute anchor-instruction's base address. 
+ unsigned Base = computeBase(MI, AnchorAddr); + + updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); + LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); + + for (auto P : InstsWCommonBase) { + TargetLoweringBase::AddrMode AM; + AM.HasBaseReg = true; + AM.BaseOffs = P.second - AnchorAddr.Offset; + + if (TLI->isLegalGlobalAddressingMode(AM)) { + LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second; + dbgs() << ")"; P.first->dump()); + updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset); + LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump()); + } + } + AnchorList.insert(AnchorInst); + return true; + } + + return false; +} + // Scan through looking for adjacent LDS operations with constant offsets from // the same base register. We rely on the scheduler to do the hard work of // clustering nearby loads, and assume these are all adjacent. bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { bool Modified = false; + // Contain the list + MemInfoMap Visited; + // Contains the list of instructions for which constant offsets are being + // promoted to the IMM. + SmallPtrSet AnchorList; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { MachineInstr &MI = *I; + if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) + Modified = true; + // Don't combine if volatile. if (MI.hasOrderedMemoryRef()) { ++I; Index: llvm/trunk/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ llvm/trunk/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -0,0 +1,485 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +declare i64 @_Z13get_global_idj(i32) + +define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) { +; GCN-LABEL: clmem_read_simplified: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %saddr = bitcast i8 
addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256 + %load2 = load i64, i64 addrspace(1)* %addr2, align 8 + %add.1 = add i64 %load2, %load1 + + %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512 + %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8 + %add.2 = add i64 %load3, %add.1 + %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768 + %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8 + %add.3 = add i64 %load4, %add.2 + + %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024 + %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8 + %add.4 = add i64 %load5, %add.3 + %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280 + %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8 + %add.5 = add i64 %load6, %add.4 + + %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536 + %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8 + %add.6 = add i64 %load7, %add.5 + %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792 + %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8 + %add.7 = add i64 %load8, %add.6 + + store i64 %add.7, i64 addrspace(1)* %saddr, align 8 + ret void +} + +define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) { +; GCN-LABEL: clmem_read: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 17 + %idx.ext11 = and i64 %a0, 4261412864 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 
addrspace(1)* + %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv + br label %for.cond.preheader + +while.cond.loopexit: ; preds = %for.body + %dec = add nsw i32 %dec31, -1 + %tobool = icmp eq i32 %dec31, 0 + br i1 %tobool, label %while.end, label %for.cond.preheader + +for.cond.preheader: ; preds = %entry, %while.cond.loopexit + %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ] + %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ] + br label %for.body + +for.body: ; preds = %for.body, %for.cond.preheader + %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ] + %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ] + %conv3 = zext i32 %block.029 to i64 + %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3 + %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8 + %add = add i64 %load1, %sum.128 + + %add9 = or i32 %block.029, 256 + %conv3.1 = zext i32 %add9 to i64 + %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1 + %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8 + %add.1 = add i64 %load2, %add + + %add9.1 = or i32 %block.029, 512 + %conv3.2 = zext i32 %add9.1 to i64 + %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2 + %l3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8 + %add.2 = add i64 %l3, %add.1 + + %add9.2 = or i32 %block.029, 768 + %conv3.3 = zext i32 %add9.2 to i64 + %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3 + %l4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8 + %add.3 = add i64 %l4, %add.2 + + %add9.3 = or i32 %block.029, 1024 + %conv3.4 = zext i32 %add9.3 to i64 + %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4 + %l5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8 + %add.4 = add i64 %l5, %add.3 + + %add9.4 = or i32 %block.029, 1280 + %conv3.5 = zext i32 %add9.4 to i64 + %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5 + %l6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8 + %add.5 = add i64 %l6, %add.4 + + %add9.5 = or i32 %block.029, 1536 + %conv3.6 = zext i32 %add9.5 to i64 + %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6 + %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8 + %add.6 = add i64 %load7, %add.5 + + %add9.6 = or i32 %block.029, 1792 + %conv3.7 = zext i32 %add9.6 to i64 + %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7 + %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8 + %add.7 = add i64 %load8, %add.6 + + %add9.7 = or i32 %block.029, 2048 + %conv3.8 = zext i32 %add9.7 to i64 + %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8 + %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8 + %add.8 = add i64 %load9, %add.7 + + %add9.8 = or i32 %block.029, 2304 + %conv3.9 = zext i32 %add9.8 to i64 + %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9 + %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8 + %add.9 = add i64 %load10, %add.8 + + %add9.9 = or i32 %block.029, 2560 + %conv3.10 = zext i32 %add9.9 to i64 + %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10 + %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8 + %add.10 = add i64 %load11, %add.9 + + %add9.31 = add nuw nsw i32 %block.029, 8192 + %cmp.31 = icmp ult i32 %add9.31, 4194304 + br 
i1 %cmp.31, label %for.body, label %while.cond.loopexit + +while.end: ; preds = %while.cond.loopexit + store i64 %add.10, i64 addrspace(1)* %a1, align 8 + ret void +} + +; using 32bit address. +define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) { +; GCN-LABEL: Address32: +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %id = shl i64 %call, 7 + %idx.ext11 = and i64 %id, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)* + + %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv + %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4 + + %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256 + %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4 + %add.1 = add i32 %load2, %load1 + + %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512 + %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4 + %add.2 = add i32 %load3, %add.1 + + %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768 + %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4 + %add.3 = add i32 %load4, %add.2 + + %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024 + %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4 + %add.4 = add i32 %load5, %add.3 + + %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280 + %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4 + %add.5 = add i32 %load6, %add.4 + + %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536 + %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4 + %add.6 = add i32 %load7, %add.5 + + %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792 + %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4 + %add.7 = add i32 %load8, %add.6 + + %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048 + %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4 + %add.8 = add i32 %load9, %add.7 + + %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* 
%add.ptr6, i64 2304 + %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4 + %add.9 = add i32 %load10, %add.8 + + store i32 %add.9, i32 addrspace(1)* %addr, align 4 + ret void +} + +define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) { +; GCN-LABEL: Offset64: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + + %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400 + %load2 = load i64, i64 addrspace(1)* %addr2, align 8 + + %add1 = add i64 %load2, %load1 + + %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656 + %load3 = load i64, i64 addrspace(1)* %addr3, align 8 + + %add2 = add i64 %load3, %add1 + + %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912 + %load4 = load i64, i64 addrspace(1)* %addr4, align 8 + %add4 = add i64 %load4, %add2 + + store i64 %add4, i64 addrspace(1)* %saddr, align 8 + ret void +} + +; TODO: Support load4 as anchor instruction. 
+define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) { +; GCN-LABEL: p32Offset64: +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024 +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)* + + %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv + %load1 = load i32, i32 addrspace(1)* %addr1, align 8 + + %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400 + %load2 = load i32, i32 addrspace(1)* %addr2, align 8 + + %add1 = add i32 %load2, %load1 + + %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656 + %load3 = load i32, i32 addrspace(1)* %addr3, align 8 + + %add2 = add i32 %load3, %add1 + + %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912 + %load4 = load i32, i32 addrspace(1)* %addr4, align 8 + %add4 = add i32 %load4, %add2 + + store i32 %add4, i32 addrspace(1)* %saddr, align 8 + ret void +} + +define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, +; GCN-LABEL: DiffBase: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off + i8 addrspace(1)* %buffer2) { +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11 + %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)* + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512 + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768 + %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8 + %add1 = add i64 %load2, %load1 + %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024 + %load3 = load i64, i64 
addrspace(1)* %add.ptr8.4, align 8 + %add2 = add i64 %load3, %add1 + + %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280 + %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8 + + %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536 + %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8 + %add3 = add i64 %load5, %load4 + + %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792 + %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8 + %add4 = add i64 %load6, %add3 + + %add5 = add i64 %add2, %add4 + + store i64 %add5, i64 addrspace(1)* %saddr, align 8 + ret void +} + +define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) { +; GCN-LABEL: ReverseOrder: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) + %conv = and i64 %call, 255 + %a0 = shl i64 %call, 7 + %idx.ext11 = and i64 %a0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + + %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792 + %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8 + %add7 = add i64 %load8, %load1 + + %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536 + %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8 + %add6 = add i64 %load7, %add7 + + %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280 + %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8 + %add5 = add i64 %load6, %add6 + + %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024 + %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8 + %add4 = add i64 %load5, %add5 + + %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768 + %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8 + %add3 = add i64 %load4, %add4 + + %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512 + %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8 + %add2 = add i64 %load3, %add3 + + %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256 + 
%load2 = load i64, i64 addrspace(1)* %addr2, align 8 + %add1 = add i64 %load2, %add2 + + store i64 %add1, i64 addrspace(1)* %saddr, align 8 + ret void +} + +define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) { +; GCN-LABEL: negativeoffset: +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +entry: + %call = tail call i64 @_Z13get_global_idj(i32 0) #2 + %conv = and i64 %call, 255 + %0 = shl i64 %call, 7 + %idx.ext11 = and i64 %0, 4294934528 + %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11 + %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)* + + %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv + + %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656 + %load1 = load i64, i64 addrspace(1)* %addr1, align 8 + + %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912 + %load2 = load i64, i64 addrspace(1)* %addr2, align 8 + + + %add = add i64 %load2, %load1 + + store i64 %add, i64 addrspace(1)* %buffer_head, align 8 + ret void +} Index: llvm/trunk/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir +++ llvm/trunk/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir @@ -0,0 +1,154 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s + +# GFX9-LABEL: name: diffoporder_add +# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, -2048, 0, 0 +# GFX9: %{{[0-9]+}}:vreg_64 = GLOBAL_LOAD_DWORDX2 %{{[0-9]+}}, 0, 0, 0 + +name: diffoporder_add +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %4:sreg_32_xm0 = COPY $sgpr101 + %5:sreg_32_xm0 = S_MOV_B32 0 + $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3 + $sgpr4 = COPY %4 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + %6:vreg_64 = COPY $vgpr0_vgpr1 + %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1 + %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec + %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec + %12:sgpr_32 = COPY %1.sub1 + %13:vgpr_32 = COPY %5 + %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec + %16:vgpr_32 = COPY %12 + %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec + %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1 + %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec + %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec + %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec + %25:sgpr_32 = S_MOV_B32 4096 + %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %25, %21, implicit $exec + %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec + %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec + %32:sgpr_32 = S_MOV_B32 
6144 + %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec + %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec + %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec +... +--- + +# GFX9-LABEL: name: LowestInMiddle +# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 11200 +# GFX9: [[BASE_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]] +# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]] +# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE_LO]], %subreg.sub0, [[BASE_HI]], %subreg.sub1 +# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -3200, 0, 0 +# +# GFX9: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 6400 +# GFX9: [[BASE1_LO:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_7:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_2]] +# GFX9: [[BASE1_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_7]] +# GFX9: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BASE1_LO]], %subreg.sub0, [[BASE1_HI]], %subreg.sub1 +# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE3]], 0, 0, 0, +# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0, + +name: LowestInMiddle +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %4:sreg_32_xm0 = COPY $sgpr101 + %5:sreg_32_xm0 = S_MOV_B32 0 + $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3 + $sgpr4 = COPY %4 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + %6:vreg_64 = COPY $vgpr0_vgpr1 + %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1 + %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec + %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec + %12:sgpr_32 = COPY %1.sub1 + %13:vgpr_32 = COPY %5 + %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec + %16:vgpr_32 = COPY %12 + %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec + %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1 + %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec + %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec + %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec + %25:sgpr_32 = S_MOV_B32 8000 + %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec + %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec + %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec + %32:sgpr_32 = S_MOV_B32 6400 + %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec + %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec + %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec + %39:sgpr_32 = S_MOV_B32 11200 + %40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec + %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed 
%41, implicit $exec + %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1 + %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec +... +--- + +# GFX9-LABEL: name: NegativeDistance +# GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 10240 +# GFX9: [[V_ADD_I32_e64_4:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_5:%[0-9]+]]:sreg_64_xexec = V_ADD_I32_e64 %{{[0-9]+}}, [[S_MOV_B32_1]] +# GFX9: [[BASE_HI:%[0-9]+]]:vgpr_32, dead %{{[0-9]+}}:sreg_64_xexec = V_ADDC_U32_e64 %{{[0-9]+}}, 0, killed [[V_ADD_I32_e64_5]] +# GFX9: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_I32_e64_4]], %subreg.sub0, [[BASE_HI]], %subreg.sub1 +# GFX9: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -4096, 0, 0 +# GFX9: [[GLOBAL_LOAD_DWORDX2_1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], -2048, 0, 0 +# GFX9: [[GLOBAL_LOAD_DWORDX2_2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[REG_SEQUENCE2]], 0, 0, 0 + +name: NegativeDistance +body: | + bb.0.entry: + %0:sgpr_64 = COPY $sgpr0_sgpr1 + %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 36, 0 + %3:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99 + %4:sreg_32_xm0 = COPY $sgpr101 + %5:sreg_32_xm0 = S_MOV_B32 0 + $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %3 + $sgpr4 = COPY %4 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + %6:vreg_64 = COPY $vgpr0_vgpr1 + %7:vgpr_32 = V_AND_B32_e32 255, %6.sub0, implicit $exec + %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %9:vreg_64 = REG_SEQUENCE killed %7, %subreg.sub0, %8, %subreg.sub1 + %10:vgpr_32 = V_LSHLREV_B32_e64 7, %6.sub0, implicit $exec + %11:vgpr_32 = V_AND_B32_e32 -32768, killed %10, implicit $exec + %12:sgpr_32 = COPY %1.sub1 + %13:vgpr_32 = COPY %5 + %14:vgpr_32, %15:sreg_64_xexec = V_ADD_I32_e64 %1.sub0, %11, implicit $exec + %16:vgpr_32 = COPY %12 + %17:vgpr_32, dead %18:sreg_64_xexec = V_ADDC_U32_e64 %16, %13, killed %15, implicit $exec + %19:vreg_64 = REG_SEQUENCE %14, %subreg.sub0, %17, %subreg.sub1 + %20:vreg_64 = V_LSHLREV_B64 3, %9, implicit $exec + %21:vgpr_32, %22:sreg_64_xexec = V_ADD_I32_e64 %14, %20.sub0, implicit $exec + %23:vgpr_32, dead %24:sreg_64_xexec = V_ADDC_U32_e64 %17, %20.sub1, killed %22, implicit $exec + %25:sgpr_32 = S_MOV_B32 6144 + %26:vgpr_32, %27:sreg_64_xexec = V_ADD_I32_e64 %21, %25, implicit $exec + %28:vgpr_32, dead %29:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %27, implicit $exec + %30:vreg_64 = REG_SEQUENCE %26, %subreg.sub0, %28, %subreg.sub1 + %31:vreg_64 = GLOBAL_LOAD_DWORDX2 %30, 0, 0, 0, implicit $exec + %32:sgpr_32 = S_MOV_B32 8192 + %33:vgpr_32, %34:sreg_64_xexec = V_ADD_I32_e64 %21, %32, implicit $exec + %35:vgpr_32, dead %36:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %34, implicit $exec + %37:vreg_64 = REG_SEQUENCE %33, %subreg.sub0, %35, %subreg.sub1 + %38:vreg_64 = GLOBAL_LOAD_DWORDX2 %37, 0, 0, 0, implicit $exec + %39:sgpr_32 = S_MOV_B32 10240 + %40:vgpr_32, %41:sreg_64_xexec = V_ADD_I32_e64 %21, %39, implicit $exec + %42:vgpr_32, dead %43:sreg_64_xexec = V_ADDC_U32_e64 %23, 0, killed %41, implicit $exec + %44:vreg_64 = REG_SEQUENCE %40, %subreg.sub0, %42, %subreg.sub1 + %45:vreg_64 = GLOBAL_LOAD_DWORDX2 %44, 0, 0, 0, implicit $exec +...
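
For reference, a minimal standalone sketch (not taken from the patch) of the Step2 anchor-selection heuristic described in the SILoadStoreOptimizer comment above: among loads that share the same base registers, pick the one whose constant offset is farthest from the current instruction's offset while the distance still fits in the immediate field. The names MemUse, fitsGlobalImmOffset, and pickAnchor are illustrative only, and the signed 13-bit range stands in for the pass's isLegalGlobalAddressingMode() query.

// Sketch of the anchor-selection heuristic on plain data.
#include <cstdint>
#include <iostream>
#include <vector>

struct MemUse {
  unsigned InstrIdx; // position of the load within the basic block
  int64_t Offset;    // constant offset already folded into its address
};

// Stand-in for isLegalGlobalAddressingMode(): assume a signed 13-bit
// immediate offset, i.e. [-4096, 4095].
static bool fitsGlobalImmOffset(int64_t Dist) {
  return Dist >= -4096 && Dist <= 4095;
}

// Returns the index of the chosen anchor in Uses, or -1 if none qualifies:
// the candidate with the largest |MIOffset - Offset| that still fits in the
// immediate field.
static int pickAnchor(int64_t MIOffset, const std::vector<MemUse> &Uses) {
  int Anchor = -1;
  uint64_t MaxDist = 0;
  for (size_t I = 0; I != Uses.size(); ++I) {
    int64_t Dist = MIOffset - Uses[I].Offset;
    uint64_t AbsDist = Dist < 0 ? uint64_t(-Dist) : uint64_t(Dist);
    if (fitsGlobalImmOffset(Dist) && AbsDist > MaxDist) {
      MaxDist = AbsDist;
      Anchor = static_cast<int>(I);
    }
  }
  return Anchor;
}

int main() {
  // Offsets from the Step2 example in the pass comment: the load at
  // &a + 4096 is being promoted; the other loads share its base registers.
  std::vector<MemUse> Uses = {{1, 6144}, {2, 8192}, {3, 10240}, {4, 12288}};
  int Anchor = pickAnchor(4096, Uses);
  if (Anchor != -1)
    std::cout << "anchor offset: " << Uses[Anchor].Offset << '\n';
  return 0;
}

Compiled as a plain C++ program this prints "anchor offset: 8192", matching the anchor chosen in the pass comment's example.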