Index: llvm/lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPU.h
+++ llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -52,7 +52,6 @@
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
-FunctionPass *createSIFixupVectorISelPass();
 FunctionPass *createSIAddIMGInitPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
@@ -148,9 +147,6 @@
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
-void initializeSIFixupVectorISelPass(PassRegistry &);
-extern char &SIFixupVectorISelID;
-
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -216,7 +216,6 @@
   initializeSILowerSGPRSpillsPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
-  initializeSIFixupVectorISelPass(*PR);
   initializeSIFoldOperandsPass(*PR);
   initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
@@ -936,13 +935,6 @@
   AMDGPUPassConfig::addInstSelector();
   addPass(&SIFixSGPRCopiesID);
   addPass(createSILowerI1CopiesPass());
-  // TODO: We have to add FinalizeISel
-  // to expand V_ADD/SUB_U64_PSEUDO before SIFixupVectorISel
-  // that expects V_ADD/SUB -> A_ADDC/SUBB pairs expanded.
-  // Will be removed as soon as SIFixupVectorISel is changed
-  // to work with V_ADD/SUB_U64_PSEUDO instead.
-  addPass(&FinalizeISelID);
-  addPass(createSIFixupVectorISelPass());
   addPass(createSIAddIMGInitPass());
   return false;
 }
Index: llvm/lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -109,7 +109,6 @@
   SIAddIMGInit.cpp
   SIAnnotateControlFlow.cpp
   SIFixSGPRCopies.cpp
-  SIFixupVectorISel.cpp
   SIFixVGPRCopies.cpp
   SIPreAllocateWWMRegs.cpp
   SIFoldOperands.cpp
Index: llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
+++ /dev/null
@@ -1,239 +0,0 @@
-//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-/// \file
-/// SIFixupVectorISel pass cleans up post ISEL Vector issues.
-/// Currently this will convert GLOBAL_{LOAD|STORE}_*
-/// and GLOBAL_Atomic_* instructions into their _SADDR variants,
-/// feeding the sreg into the saddr field of the new instruction.
-/// We currently handle a REG_SEQUENCE feeding the vaddr
-/// and decompose it into a base and index.
-///
-/// Transform:
-/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_CO_U32_e64 %21:sgpr_32, %22:vgpr_32
-/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32,
-///     %24:vgpr_32, %19:sreg_64_xexec
-/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1
-/// %11:vreg_64 = COPY %16:vreg_64
-/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0
-/// Into:
-/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
-/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1
-/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16...
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-#define DEBUG_TYPE "si-fixup-vector-isel"
-
-using namespace llvm;
-
-static cl::opt<bool> EnableGlobalSGPRAddr(
-  "amdgpu-enable-global-sgpr-addr",
-  cl::desc("Enable use of SGPR regs for GLOBAL LOAD/STORE instructions"),
-  cl::init(false));
-
-STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
-STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
-
-namespace {
-
-class SIFixupVectorISel : public MachineFunctionPass {
-public:
-  static char ID;
-
-public:
-  SIFixupVectorISel() : MachineFunctionPass(ID) {
-    initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
-                "SI Fixup Vector ISel", false, false)
-
-char SIFixupVectorISel::ID = 0;
-
-char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
-
-FunctionPass *llvm::createSIFixupVectorISelPass() {
-  return new SIFixupVectorISel();
-}
-
-static bool findSRegBaseAndIndex(MachineOperand *Op,
-                                 unsigned &BaseReg,
-                                 unsigned &IndexReg,
-                                 MachineRegisterInfo &MRI,
-                                 const SIRegisterInfo *TRI) {
-  SmallVector<MachineOperand *, 8> Worklist;
-  Worklist.push_back(Op);
-  while (!Worklist.empty()) {
-    MachineOperand *WOp = Worklist.pop_back_val();
-    if (!WOp->isReg() || !Register::isVirtualRegister(WOp->getReg()))
-      continue;
-    MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
-    switch (DefInst->getOpcode()) {
-    default:
-      continue;
-    case AMDGPU::COPY:
-      Worklist.push_back(&DefInst->getOperand(1));
-      break;
-    case AMDGPU::REG_SEQUENCE:
-      if (DefInst->getNumOperands() != 5)
-        continue;
-      Worklist.push_back(&DefInst->getOperand(1));
-      Worklist.push_back(&DefInst->getOperand(3));
-      break;
-    case AMDGPU::V_ADD_CO_U32_e64:
-      // The V_ADD_* and its analogous V_ADDCV_* are generated by
-      // a previous pass which lowered from an ADD_64_PSEUDO,
-      // which generates subregs to break up the 64 bit args.
-      if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
-        continue;
-      BaseReg = DefInst->getOperand(2).getReg();
-      if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
-        continue;
-      IndexReg = DefInst->getOperand(3).getReg();
-      // Chase the IndexReg.
-      MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
-      if (!MI || !MI->isCopy())
-        continue;
-      // Make sure the reg class is 64 bit for Index.
-      // If the Index register is a subreg, we want it to reference
-      // a 64 bit register which we will use as the Index reg.
-      const TargetRegisterClass *IdxRC, *BaseRC;
-      IdxRC = MRI.getRegClass(MI->getOperand(1).getReg());
-      if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64)
-        continue;
-      IndexReg = MI->getOperand(1).getReg();
-      // Chase the BaseReg.
-      MI = MRI.getUniqueVRegDef(BaseReg);
-      if (!MI || !MI->isCopy())
-        continue;
-      // Make sure the register class is 64 bit for Base.
-      BaseReg = MI->getOperand(1).getReg();
-      BaseRC = MRI.getRegClass(BaseReg);
-      if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64)
-        continue;
-      // Make sure Base is SReg and Index is VReg.
-      if (!TRI->isSGPRReg(MRI, BaseReg))
-        return false;
-      if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
-        return false;
-      // clear any killed flags on Index and Base regs, used later.
-      MRI.clearKillFlags(IndexReg);
-      MRI.clearKillFlags(BaseReg);
-      return true;
-    }
-  }
-  return false;
-}
-
-// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR.
-static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
-                             MachineFunction &MF,
-                             MachineRegisterInfo &MRI,
-                             const GCNSubtarget &ST,
-                             const SIInstrInfo *TII,
-                             const SIRegisterInfo *TRI) {
-  if (!EnableGlobalSGPRAddr)
-    return false;
-  bool FuncModified = false;
-  MachineBasicBlock::iterator I, Next;
-  for (I = MBB.begin(); I != MBB.end(); I = Next) {
-    Next = std::next(I);
-    MachineInstr &MI = *I;
-    int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
-    if (NewOpcd < 0)
-      continue;
-    // Update our statistics on opportunities seen.
-    ++NumSGPRGlobalOccurs;
-    LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
-    // Need a Base and Index or we cant transform to _SADDR.
-    unsigned BaseReg = 0;
-    unsigned IndexReg = 0;
-    MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
-    if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
-      continue;
-    ++NumSGPRGlobalSaddrs;
-    FuncModified = true;
-    // Create the new _SADDR Memory instruction.
-    bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
-    MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
-    MachineInstr *NewGlob = nullptr;
-    NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
-    if (HasVdst)
-      NewGlob->addOperand(MF, MI.getOperand(0));
-    NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
-    if (VData)
-      NewGlob->addOperand(MF, *VData);
-    NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
-    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
-
-    MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
-    // Atomics dont have a GLC, so omit the field if not there.
-    if (Glc)
-      NewGlob->addOperand(MF, *Glc);
-
-    MachineOperand *DLC = TII->getNamedOperand(MI, AMDGPU::OpName::dlc);
-    if (DLC)
-      NewGlob->addOperand(MF, *DLC);
-
-    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
-    // _D16 have an vdst_in operand, copy it in.
-    MachineOperand *VDstInOp = TII->getNamedOperand(MI,
-                                      AMDGPU::OpName::vdst_in);
-    if (VDstInOp)
-      NewGlob->addOperand(MF, *VDstInOp);
-    NewGlob->copyImplicitOps(MF, MI);
-    NewGlob->cloneMemRefs(MF, MI);
-    // Remove the old Global Memop instruction.
-    MI.eraseFromParent();
-    LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
-  }
-  return FuncModified;
-}
-
-bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
-  // Only need to run this in SelectionDAG path.
-  if (MF.getProperties().hasProperty(
-        MachineFunctionProperties::Property::Selected))
-    return false;
-
-  if (skipFunction(MF.getFunction()))
-    return false;
-
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
-  bool FuncModified = false;
-  for (MachineBasicBlock &MBB : MF) {
-    // Cleanup missed Saddr opportunites from ISel.
-    FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI);
-  }
-  return FuncModified;
-}
Index: llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
+++ llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr -stop-after=si-form-memory-clauses < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -stop-after=si-form-memory-clauses < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}name:{{[ ]*}}vector_clause
 ; GCN: BUNDLE
Index: llvm/test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s
 
 @lds = addrspace(3) global [512 x float] undef, align 4
 @lds.f64 = addrspace(3) global [512 x double] undef, align 8
@@ -9,7 +9,8 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[VAL:v[0-9]+]]
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]]
 ; GCN: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
@@ -31,10 +32,11 @@
 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
 
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]]
 ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
@@ -194,10 +196,12 @@
 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
+
+; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}}
+; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]]
 
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
@@ -379,11 +383,11 @@
 ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
-
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
 
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 3, v{{[0-9]+}}
+; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VBASE]]
 ; GCN: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
Index: llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -1,11 +1,11 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 @lds = addrspace(3) global [512 x float] undef, align 4
 
 ; GCN-LABEL: @simple_write2st64_one_val_f32_0_1
 ; CI-DAG: s_mov_b32 m0
-; GFX9-NOT: m0n
+; GFX9-NOT: m0
 
 ; GCN-DAG: {{buffer|global}}_load_dword [[VAL:v[0-9]+]]
 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
@@ -30,8 +30,8 @@
 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4{{$}}
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
@@ -59,8 +59,8 @@
 ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4
+; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
 ; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
@@ -87,8 +87,8 @@
 ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8
+; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8
 
 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}}
 ; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]]
Index: llvm/test/CodeGen/AMDGPU/global-load-store-atomics.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/global-load-store-atomics.mir
+++ /dev/null
@@ -1,249 +0,0 @@
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-fixup-vector-isel -amdgpu-enable-global-sgpr-addr %s -o - | FileCheck -check-prefix=GCN %s
-
-# Coverage tests for GLOBAL_* to their _SADDR equivalent.
-
-# GCN-LABEL: name: global_load_store_atomics
-# GCN: GLOBAL_LOAD_DWORD_SADDR
-# GCN: GLOBAL_STORE_DWORD_SADDR
-# GCN: GLOBAL_LOAD_DWORDX2_SADDR
-# GCN: GLOBAL_STORE_DWORDX2_SADDR
-# GCN: GLOBAL_LOAD_DWORDX3_SADDR
-# GCN: GLOBAL_STORE_DWORDX3_SADDR
-# GCN: GLOBAL_LOAD_DWORDX4_SADDR
-# GCN: GLOBAL_STORE_DWORDX4_SADDR
-# GCN: GLOBAL_LOAD_SSHORT_SADDR
-# GCN: GLOBAL_STORE_SHORT_SADDR
-# GCN: GLOBAL_LOAD_USHORT_SADDR
-# GCN: GLOBAL_STORE_SHORT_SADDR
-# GCN: GLOBAL_LOAD_UBYTE_SADDR
-# GCN: GLOBAL_STORE_BYTE_SADDR
-# GCN: GLOBAL_LOAD_SBYTE_SADDR
-# GCN: GLOBAL_STORE_BYTE_SADDR
-# GCN: GLOBAL_LOAD_SBYTE_D16_SADDR
-# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_UBYTE_D16_SADDR
-# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_SBYTE_D16_HI_SADDR
-# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_UBYTE_D16_HI_SADDR
-# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_SHORT_D16_HI_SADDR
-# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR
-# GCN: GLOBAL_LOAD_SHORT_D16_SADDR
-# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR
-
-# GCN: GLOBAL_ATOMIC_XOR_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_XOR_SADDR %
-# GCN: GLOBAL_ATOMIC_SMIN_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SMIN_SADDR %
-# GCN: GLOBAL_ATOMIC_AND_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_AND_SADDR %
-# GCN: GLOBAL_ATOMIC_SWAP_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SWAP_SADDR %
-# GCN: GLOBAL_ATOMIC_SMAX_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SMAX_SADDR %
-# GCN: GLOBAL_ATOMIC_UMIN_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_UMIN_SADDR %
-# GCN: GLOBAL_ATOMIC_UMAX_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_UMAX_SADDR %
-# GCN: GLOBAL_ATOMIC_OR_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_OR_SADDR %
-# GCN: GLOBAL_ATOMIC_ADD_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_ADD_SADDR %
-# GCN: GLOBAL_ATOMIC_SUB_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SUB_SADDR %
-# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR %
-# GCN: GLOBAL_ATOMIC_INC_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_INC_SADDR %
-# GCN: GLOBAL_ATOMIC_DEC_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_DEC_SADDR %
-
-# GCN: GLOBAL_ATOMIC_OR_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_OR_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_AND_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_AND_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_INC_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_INC_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR %
-# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR_RTN
-# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR %
-
-name: global_load_store_atomics
-body: |
-  bb.0:
-    liveins: $vgpr0, $sgpr0_sgpr1
-
-    %1:sgpr_64 = COPY $sgpr0_sgpr1
-    %0:vgpr_32 = COPY $vgpr0
-    %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 36, 0, 0 :: (dereferenceable invariant load 8)
-    %5:sreg_32_xm0 = S_MOV_B32 2
-    %6:vgpr_32 = V_LSHLREV_B32_e64 killed %5, %0, implicit $exec
-    %7:sreg_32_xm0 = S_MOV_B32 0
-    %15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %14:vreg_64 = REG_SEQUENCE killed %6, %subreg.sub0, killed %15, %subreg.sub1
-    %21:sgpr_32 = COPY %4.sub0
-    %22:vgpr_32 = COPY %14.sub0
-    %23:sgpr_32 = COPY %4.sub1
-    %24:vgpr_32 = COPY %14.sub1
-    %17:vgpr_32, %19:sreg_64_xexec = V_ADD_CO_U32_e64 %21, %22, 0, implicit $exec
-    %25:vgpr_32 = COPY %23
-    %18:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 %25, %24, killed %19, 0, implicit $exec
-    %16:vreg_64 = REG_SEQUENCE %17, %subreg.sub0, %18, %subreg.sub1
-    %11:vreg_64 = COPY %16
-
-    %10:vgpr_32 = GLOBAL_LOAD_DWORD %11, 16, 0, 0, 0, implicit $exec :: (load 4)
-    GLOBAL_STORE_DWORD %11, %10, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %40:vreg_64 = GLOBAL_LOAD_DWORDX2 %11, 16, 0, 0, 0, implicit $exec :: (load 4)
-    GLOBAL_STORE_DWORDX2 %11, %40, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %41:vreg_96 = GLOBAL_LOAD_DWORDX3 %11, 16, 0, 0, 0, implicit $exec :: (load 4)
-    GLOBAL_STORE_DWORDX3 %11, %41, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %42:vreg_128 = GLOBAL_LOAD_DWORDX4 %11, 16, 0, 0, 0, implicit $exec :: (load 4)
-    GLOBAL_STORE_DWORDX4 %11, %42, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %43:vgpr_32 = GLOBAL_LOAD_SSHORT %11, 16, 0, 0, 0, implicit $exec :: (load 4)
-    GLOBAL_STORE_SHORT %11, %43, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %44:vgpr_32 = GLOBAL_LOAD_USHORT %11, 16, 0, 0, 0, implicit $exec :: (load 4)
-    GLOBAL_STORE_SHORT %11, %44, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %45:vgpr_32 = GLOBAL_LOAD_UBYTE %11, 16, 0, 0, 0, implicit $exec :: (load 4)
-    GLOBAL_STORE_BYTE %11, %45, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %46:vgpr_32 = GLOBAL_LOAD_SBYTE %11, 16, 0, 0, 0, implicit $exec :: (load 4)
-    GLOBAL_STORE_BYTE %11, %46, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %47:vgpr_32 = GLOBAL_LOAD_SBYTE_D16 %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
-    GLOBAL_STORE_BYTE_D16_HI %11, %47, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %48:vgpr_32 = GLOBAL_LOAD_UBYTE_D16 %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
-    GLOBAL_STORE_BYTE_D16_HI %11, %48, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %49:vgpr_32 = GLOBAL_LOAD_SBYTE_D16_HI %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
-    GLOBAL_STORE_BYTE_D16_HI %11, %49, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %50:vgpr_32 = GLOBAL_LOAD_UBYTE_D16_HI %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
-    GLOBAL_STORE_BYTE_D16_HI %11, %50, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %51:vgpr_32 = GLOBAL_LOAD_SHORT_D16_HI %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
-    GLOBAL_STORE_SHORT_D16_HI %11, %51, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    %52:vgpr_32 = GLOBAL_LOAD_SHORT_D16 %11, 16, 0, 0, 0, %46, implicit $exec :: (load 4)
-    GLOBAL_STORE_SHORT_D16_HI %11, %52, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-
-    %53:vgpr_32 = GLOBAL_ATOMIC_XOR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %53, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_XOR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %54:vgpr_32 = GLOBAL_ATOMIC_SMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %54, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_SMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %55:vgpr_32 = GLOBAL_ATOMIC_AND_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %55, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_AND %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %56:vgpr_32 = GLOBAL_ATOMIC_SWAP_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %56, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_SWAP %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %57:vgpr_32 = GLOBAL_ATOMIC_SMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %57, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_SMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %58:vgpr_32 = GLOBAL_ATOMIC_UMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %58, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_UMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %59:vgpr_32 = GLOBAL_ATOMIC_UMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %59, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_UMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %60:vgpr_32 = GLOBAL_ATOMIC_OR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %60, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_OR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %61:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %61, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_ADD %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %62:vgpr_32 = GLOBAL_ATOMIC_SUB_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %62, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_SUB %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %63:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %63, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_CMPSWAP %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %64:vgpr_32 = GLOBAL_ATOMIC_INC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %64, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_INC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %65:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORD %11, %65, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_DEC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %66:vreg_64 = GLOBAL_ATOMIC_OR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %66, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_OR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %67:vreg_64 = GLOBAL_ATOMIC_XOR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %67, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_XOR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %68:vreg_64 = GLOBAL_ATOMIC_AND_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %68, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_AND_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %69:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %69, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_ADD_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %70:vreg_64 = GLOBAL_ATOMIC_SUB_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %70, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_SUB_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %71:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %71, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_DEC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %72:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %72, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_INC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %73:vreg_64 = GLOBAL_ATOMIC_SMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %73, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_SMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %74:vreg_64 = GLOBAL_ATOMIC_SWAP_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %74, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_SWAP_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %75:vreg_64 = GLOBAL_ATOMIC_SMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %75, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_SMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %76:vreg_64 = GLOBAL_ATOMIC_UMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %76, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_UMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %77:vreg_64 = GLOBAL_ATOMIC_UMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %77, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_UMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    %79:sgpr_128 = REG_SEQUENCE %4, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3
-    %80:vreg_128 = COPY %79
-
-    %78:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %11, %78, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
-    GLOBAL_ATOMIC_CMPSWAP_X2 %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1)
-
-    S_ENDPGM 0
-...
Index: llvm/test/CodeGen/AMDGPU/global-saddr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/global-saddr.ll
+++ /dev/null
@@ -1,103 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GFX9 %s
-
-; Test for a conv2d like sequence of loads.
-
-; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}}
-; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
-; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}}
-; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}}
-; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}}
-
-define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) {
-entry:
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  %idx = zext i32 %id to i64
-  %gep = getelementptr i64, i64 addrspace(1)* %src_image, i64 %idx
-  %ptr0 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1
-  %load0 = load i64, i64 addrspace(1)* %ptr0
-  %ptr1 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 2
-  %load1 = load i64, i64 addrspace(1)* %ptr1
-  %ptr2 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 3
-  %load2 = load i64, i64 addrspace(1)* %ptr2
-  %ptr3 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 4
-  %load3 = load i64, i64 addrspace(1)* %ptr3
-  %ptr4 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -4
-  %load4 = load i64, i64 addrspace(1)* %ptr4
-  %ptr5 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -3
-  %load5 = load i64, i64 addrspace(1)* %ptr5
-  %ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2
-  %load6 = load i64, i64 addrspace(1)* %ptr6
-  %ptr7 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1
-  %load7 = load i64, i64 addrspace(1)* %ptr7
-  %ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 0
-  %load8 = load i64, i64 addrspace(1)* %ptr8
-  %add0 = add i64 %load1, %load0
-  %add1 = add i64 %load3, %load2
-  %add2 = add i64 %load5, %load4
-  %add3 = add i64 %load7, %load6
-  %add4 = add i64 %add0, %load8
-  %add5 = add i64 %add2, %add1
-  %add6 = add i64 %add4, %add3
-  %add7 = add i64 %add6, %add5
-  %gep9 = getelementptr i64, i64 addrspace(1)* %dst_image, i64 %idx
-  %ptr9 = getelementptr inbounds i64, i64 addrspace(1)* %gep9, i64 1
-  store volatile i64 %add7, i64 addrspace(1)* %ptr9
-
-; Test various offset boundaries.
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}}
-; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
-  %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
-  %load11 = load i64, i64 addrspace(1)* %gep11
-  %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023
-  %load12 = load i64, i64 addrspace(1)* %gep12
-  %gep13 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
-  %load13 = load i64, i64 addrspace(1)* %gep13
-  %add11 = add i64 %load11, %load12
-  %add12 = add i64 %add11, %load13
-  store volatile i64 %add12, i64 addrspace(1)* undef
-
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
-  %gep21 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1024
-  %load21 = load i64, i64 addrspace(1)* %gep21
-  %gep22 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2048
-  %load22 = load i64, i64 addrspace(1)* %gep22
-  %gep23 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -512
-  %load23 = load i64, i64 addrspace(1)* %gep23
-  %add21 = add i64 %load22, %load21
-  %add22 = add i64 %add21, %load23
-  store volatile i64 %add22, i64 addrspace(1)* undef
-
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
-  %gep31 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 257
-  %load31 = load i64, i64 addrspace(1)* %gep31
-  %gep32 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 256
-  %load32 = load i64, i64 addrspace(1)* %gep32
-  %gep33 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255
-  %load33 = load i64, i64 addrspace(1)* %gep33
-  %add34 = add i64 %load32, %load31
-  %add35 = add i64 %add34, %load33
-  store volatile i64 %add35, i64 addrspace(1)* undef
-  ret void
-}
-
-; GFX9-LABEL: {{^}}_amdgpu_cs_main:
-; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
-; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}}
-; GFX9-NEXT: s_waitcnt
-; GFX9-NOT: global_load_dword
-
-define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) {
-bb:
-  %tmp1 = inttoptr i64 %arg to <4 x i64> addrspace(1)*
-  %tmp2 = load volatile <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16
-  store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef
-  ret void
-}
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-attributes #0 = { convergent nounwind }
-attributes #1 = { nounwind readnone speculatable }
Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
+++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign -amdgpu-enable-global-sgpr-addr -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
Index: llvm/test/CodeGen/AMDGPU/madak.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/madak.ll
+++ llvm/test/CodeGen/AMDGPU/madak.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare float @llvm.fabs.f32(float) nounwind readnone
Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
+++ llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -494,8 +494,8 @@
 ; GCN-LABEL: {{^}}nontemporal_global_1:
 ; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
-; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
-; GFX10: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
+; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
+; GFX10: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off slc{{$}}
 
 ; GFX10: .amdhsa_kernel nontemporal_global_1
 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
+++ llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
+; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -363,8 +363,8 @@
 ; GCN-LABEL: {{^}}nontemporal_global_1:
 ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
-; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
-; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
+; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
+; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}}
 
 ; GFX10: .amdhsa_kernel nontemporal_global_1
 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
Index: llvm/test/CodeGen/AMDGPU/memory_clause.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -1,27 +1,31 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
 ; GCN-LABEL: vector_clause:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; GCN-NEXT:    v_mov_b32_e32 v17, 0
-; GCN-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v18, 4, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v[16:17], s[2:3]
-; GCN-NEXT:    global_load_dwordx4 v[4:7], v[16:17], s[2:3] offset:16
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v[16:17], s[2:3] offset:32
-; GCN-NEXT:    global_load_dwordx4 v[12:15], v[16:17], s[2:3] offset:48
-; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    v_add_co_u32_e32 v16, vcc, s2, v18
+; GCN-NEXT:    v_addc_co_u32_e32 v17, vcc, 0, v0, vcc
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v[16:17], off
+; GCN-NEXT:    global_load_dwordx4 v[4:7], v[16:17], off offset:16
+; GCN-NEXT:    global_load_dwordx4 v[8:11], v[16:17], off offset:32
+; GCN-NEXT:    global_load_dwordx4 v[12:15], v[16:17], off offset:48
+; GCN-NEXT:    v_mov_b32_e32 v17, s5
+; GCN-NEXT:    v_add_co_u32_e32 v16, vcc, s4, v18
+; GCN-NEXT:    v_addc_co_u32_e32 v17, vcc, 0, v17, vcc
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v[16:17], v[0:3], s[4:5]
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[0:3], off
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v[16:17], v[4:7], s[4:5] offset:16
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[4:7], off offset:16
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v[16:17], v[8:11], s[4:5] offset:32
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[8:11], off offset:32
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NEXT:    global_store_dwordx4 v[16:17], v[12:15], s[4:5] offset:48
+; GCN-NEXT:    global_store_dwordx4 v[16:17], v[12:15], off offset:48
 ; GCN-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -189,10 +193,12 @@
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx2 v[8:9], v[0:1], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    global_load_dwordx2 v[8:9], v[0:1], off
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
Index: llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-global-sgpr-addr < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
 
 @stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4
 @stored_constant_ptr = addrspace(3) global i32 addrspace(4)* undef, align 8
 
@@ -257,14 +257,14 @@
 ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
 ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
 
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28
+; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44
 
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}}
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}}
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36
+; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52
 define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
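
Reviewer note, not part of the patch: the updated checks throughout this diff all encode the same post-patch GFX9 pattern, where the 64-bit address is materialized with a v_add_co_u32/v_addc_co_u32 carry pair and the VMEM access then uses a null saddr, printed as "off". A minimal standalone test in the style of the files above can reproduce this; the file, the function name @no_saddr_example, and its check lines below are illustrative sketches based on the checks in this patch, not tests taken from it:

; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s

; The divergent per-lane index is added to the uniform kernel-argument
; base with a VALU carry pair, and the load addresses through the
; resulting VGPR pair with a null saddr ("off").
; GFX9: v_add_co_u32_e32
; GFX9: v_addc_co_u32_e32
; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off
define amdgpu_kernel void @no_saddr_example(i32 addrspace(1)* %p) {
  %id = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = zext i32 %id to i64
  %gep = getelementptr i32, i32 addrspace(1)* %p, i64 %idx
  %v = load i32, i32 addrspace(1)* %gep
  store volatile i32 %v, i32 addrspace(1)* undef
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()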