Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -41,6 +41,7 @@
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIFixupVectorISelPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -122,6 +123,9 @@
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
+void initializeSIFixupVectorISelPass(PassRegistry &);
+extern char &SIFixupVectorISelID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -161,6 +161,7 @@
   initializeSILowerI1CopiesPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
+  initializeSIFixupVectorISelPass(*PR);
   initializeSIFoldOperandsPass(*PR);
   initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
@@ -813,6 +814,7 @@
   AMDGPUPassConfig::addInstSelector();
   addPass(&SIFixSGPRCopiesID);
   addPass(createSILowerI1CopiesPass());
+  addPass(createSIFixupVectorISelPass());
   return false;
 }
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -96,6 +96,7 @@
   SIAnnotateControlFlow.cpp
   SIDebuggerInsertNops.cpp
   SIFixSGPRCopies.cpp
+  SIFixupVectorISel.cpp
   SIFixVGPRCopies.cpp
   SIFixWWMLiveness.cpp
   SIFoldOperands.cpp
Index: lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- lib/Target/AMDGPU/FLATInstructions.td
+++ lib/Target/AMDGPU/FLATInstructions.td
@@ -121,6 +121,11 @@
   let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
 }
 
+class GlobalSaddrTable <bit is_saddr, string Name = ""> {
+  bit IsSaddr = is_saddr;
+  string SaddrOp = Name;
+}
+
 // TODO: Is exec allowed for saddr? The disabled value 0x7f is the
 // same encoding value as exec_hi, so it isn't possible to use that if
 // saddr is 32-bit (which isn't handled here yet).
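The IsSaddr/SaddrOp fields feed the getGlobalSaddrOp InstrMapping added to SIInstrInfo.td below: defs that share a SaddrOp key are related across the IsSaddr = 0/1 columns, so the generated table maps each plain GLOBAL_* opcode to its _SADDR twin. A minimal sketch of the intended query, mirroring how SIFixupVectorISel.cpp further down in this patch uses it (a negative result is the usual TableGen InstrMapping convention for "no mapping"):

  int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
  if (NewOpcd < 0)   // No _SADDR variant exists for this opcode.
    continue;
  // Otherwise NewOpcd is the matching GLOBAL_*_SADDR opcode.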
@@ -171,15 +176,19 @@ multiclass FLAT_Global_Load_Pseudo { let is_flat_global = 1 in { - def "" : FLAT_Load_Pseudo; - def _SADDR : FLAT_Load_Pseudo; + def "" : FLAT_Load_Pseudo, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Load_Pseudo, + GlobalSaddrTable<1, opName>; } } multiclass FLAT_Global_Store_Pseudo { let is_flat_global = 1 in { - def "" : FLAT_Store_Pseudo; - def _SADDR : FLAT_Store_Pseudo; + def "" : FLAT_Store_Pseudo, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Store_Pseudo, + GlobalSaddrTable<1, opName>; } } @@ -262,6 +271,7 @@ (outs), (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc), " $vaddr, $vdata$offset$slc">, + GlobalSaddrTable<0, opName>, AtomicNoRet { let PseudoInstr = NAME; } @@ -272,6 +282,7 @@ " $vdst, $vaddr, $vdata$offset glc$slc", [(set vt:$vdst, (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, + GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet ; } @@ -287,6 +298,7 @@ (outs), (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, off$offset$slc">, + GlobalSaddrTable<0, opName>, AtomicNoRet { let has_saddr = 1; let PseudoInstr = NAME; @@ -296,6 +308,7 @@ (outs), (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, $saddr$offset$slc">, + GlobalSaddrTable<1, opName>, AtomicNoRet { let has_saddr = 1; let enabled_saddr = 1; @@ -317,6 +330,7 @@ " $vdst, $vaddr, $vdata, off$offset glc$slc", [(set vt:$vdst, (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, + GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet { let has_saddr = 1; } @@ -325,6 +339,7 @@ (outs vdst_rc:$vdst), (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">, + GlobalSaddrTable<1, opName#"_rtn">, AtomicNoRet { let has_saddr = 1; let enabled_saddr = 1; Index: lib/Target/AMDGPU/SIFixupVectorISel.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -0,0 +1,224 @@ +//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// SIFixupVectorISel pass cleans up post ISEL Vector issues. +/// Currently this will convert GLOBAL_{LOAD|STORE}_* +/// and GLOBAL_Atomic_* instructions into their _SADDR variants, +/// feeding the sreg into the saddr field of the new instruction. +/// We currently handle a REG_SEQUENCE feeding the vaddr +/// and decompose it into a base and index. +/// +/// Transform: +/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32 +/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32, +/// %24:vgpr_32, %19:sreg_64_xexec +/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1 +/// %11:vreg_64 = COPY %16:vreg_64 +/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0 +/// Into: +/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0 +/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1 +/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16... 
+///
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "si-fixup-vector-isel"
+
+using namespace llvm;
+
+STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
+STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
+
+namespace {
+
+class SIFixupVectorISel : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixupVectorISel() : MachineFunctionPass(ID) {
+    initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
+                "SI Fixup Vector ISel", false, false)
+
+char SIFixupVectorISel::ID = 0;
+
+char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
+
+FunctionPass *llvm::createSIFixupVectorISelPass() {
+  return new SIFixupVectorISel();
+}
+
+static bool findSRegBaseAndIndex(MachineOperand *Op,
+                                 unsigned &BaseReg,
+                                 unsigned &IndexReg,
+                                 MachineRegisterInfo &MRI,
+                                 const SIRegisterInfo *TRI) {
+  SmallVector<MachineOperand *, 8> Worklist;
+  Worklist.push_back(Op);
+  while (!Worklist.empty()) {
+    MachineOperand *WOp = Worklist.pop_back_val();
+    if (!WOp->isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+      continue;
+    MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
+    switch (DefInst->getOpcode()) {
+    default:
+      continue;
+    case AMDGPU::COPY:
+      Worklist.push_back(&DefInst->getOperand(1));
+      break;
+    case AMDGPU::REG_SEQUENCE:
+      if (DefInst->getNumOperands() != 5)
+        continue;
+      Worklist.push_back(&DefInst->getOperand(1));
+      Worklist.push_back(&DefInst->getOperand(3));
+      break;
+    case AMDGPU::V_ADD_I32_e64:
+      // The V_ADD_* and its analogous V_ADDC_* are generated when a
+      // previous pass lowers an ADD_64_PSEUDO, which uses subregisters
+      // to break up the 64-bit arguments.
+      if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      BaseReg = DefInst->getOperand(2).getReg();
+      if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      IndexReg = DefInst->getOperand(3).getReg();
+      // Chase the IndexReg.
+      MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      // Make sure the reg class is 64-bit for Index.
+      // If the Index register is a subreg, we want it to reference
+      // a 64-bit register which we will use as the Index reg.
+      const TargetRegisterClass *IdxRC, *BaseRC;
+      IdxRC = MRI.getRegClass(MI->getOperand(1).getReg());
+      if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64)
+        continue;
+      IndexReg = MI->getOperand(1).getReg();
+      // Chase the BaseReg.
+      MI = MRI.getUniqueVRegDef(BaseReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      // Make sure the register class is 64-bit for Base.
+      BaseReg = MI->getOperand(1).getReg();
+      BaseRC = MRI.getRegClass(BaseReg);
+      if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64)
+        continue;
+      // Make sure Base is SReg and Index is VReg.
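+      // The SGPR base will become the saddr operand of the new _SADDR
+      // instruction and the VGPR index will remain its vaddr operand.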
+      if (!TRI->isSGPRReg(MRI, BaseReg))
+        return false;
+      if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
+        return false;
+      // Clear any kill flags on the Index and Base regs; they are used later.
+      MRI.clearKillFlags(IndexReg);
+      MRI.clearKillFlags(BaseReg);
+      return true;
+    }
+  }
+  return false;
+}
+
+// Identify global LOAD, STORE and ATOMIC instructions and try to convert
+// them to the _SADDR form.
+static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
+                             MachineFunction &MF,
+                             MachineRegisterInfo &MRI,
+                             const GCNSubtarget &ST,
+                             const SIInstrInfo *TII,
+                             const SIRegisterInfo *TRI) {
+  bool FuncModified = false;
+  MachineBasicBlock::iterator I, Next;
+  for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MI = *I;
+    int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
+    if (NewOpcd < 0)
+      continue;
+    // Update our statistics on opportunities seen.
+    ++NumSGPRGlobalOccurs;
+    LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
+    // Need a Base and Index or we can't transform to _SADDR.
+    unsigned BaseReg = 0;
+    unsigned IndexReg = 0;
+    MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+    if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
+      continue;
+    ++NumSGPRGlobalSaddrs;
+    FuncModified = true;
+    // Create the new _SADDR memory instruction.
+    bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
+    MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+    MachineInstr *NewGlob = nullptr;
+    NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
+    if (HasVdst)
+      NewGlob->addOperand(MF, MI.getOperand(0));
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
+    if (VData)
+      NewGlob->addOperand(MF, *VData);
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
+
+    MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
+    // Atomics don't have a GLC operand, so omit the field if it is not there.
+    if (Glc)
+      NewGlob->addOperand(MF, *Glc);
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
+    // _D16 variants have a vdst_in operand; copy it in.
+    MachineOperand *VDstInOp = TII->getNamedOperand(MI,
+                                      AMDGPU::OpName::vdst_in);
+    if (VDstInOp)
+      NewGlob->addOperand(MF, *VDstInOp);
+    NewGlob->copyImplicitOps(MF, MI);
+    NewGlob->cloneMemRefs(MF, MI);
+    // Remove the old global memop instruction.
+    MI.eraseFromParent();
+    LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
+  }
+  return FuncModified;
+}
+
+bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  bool FuncModified = false;
+  for (MachineBasicBlock &MBB : MF) {
+    // Clean up missed SADDR opportunities from ISel.
+ FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI); + } + return FuncModified; +} Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -954,6 +954,9 @@ LLVM_READONLY int getSOPKOp(uint16_t Opcode); + LLVM_READONLY + int getGlobalSaddrOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -2017,6 +2017,15 @@ let ValueCols = [["0"]]; } +// Maps a GLOBAL to its SADDR form. +def getGlobalSaddrOp : InstrMapping { + let FilterClass = "GlobalSaddrTable"; + let RowFields = ["SaddrOp"]; + let ColFields = ["IsSaddr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + include "SIInstructions.td" include "DSInstructions.td" Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -754,6 +754,7 @@ case AMDGPU::VS_64RegClassID: case AMDGPU::SReg_64RegClassID: case AMDGPU::VReg_64RegClassID: + case AMDGPU::SReg_64_XEXECRegClassID: return 64; case AMDGPU::VReg_96RegClassID: return 96; Index: test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -31,8 +31,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 @@ -177,8 +177,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 @@ -362,8 +362,8 @@ ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 
[[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2st64.ll +++ test/CodeGen/AMDGPU/ds_write2st64.ll @@ -30,8 +30,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} @@ -59,8 +59,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] @@ -87,8 +87,8 @@ ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] Index: test/CodeGen/AMDGPU/global-load-store-atomics.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-load-store-atomics.mir @@ -0,0 +1,249 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-fixup-vector-isel %s -o - | FileCheck -check-prefix=GCN %s + +# Coverage tests for GLOBAL_* to their _SADDR equivalent. 
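+# Every plain GLOBAL_* operation in the body below addresses memory through
+# %11, a 64-bit vaddr built from an SGPR base (%4, the S_LOAD_DWORDX2_IMM
+# result) plus a VGPR index, so si-fixup-vector-isel is expected to rewrite
+# each one to the _SADDR form checked below, with %4 feeding the new saddr
+# operand and the VGPR REG_SEQUENCE remaining the vaddr.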
+ +# GCN-LABEL: name: global_load_store_atomics +# GCN: GLOBAL_LOAD_DWORD_SADDR +# GCN: GLOBAL_STORE_DWORD_SADDR +# GCN: GLOBAL_LOAD_DWORDX2_SADDR +# GCN: GLOBAL_STORE_DWORDX2_SADDR +# GCN: GLOBAL_LOAD_DWORDX3_SADDR +# GCN: GLOBAL_STORE_DWORDX3_SADDR +# GCN: GLOBAL_LOAD_DWORDX4_SADDR +# GCN: GLOBAL_STORE_DWORDX4_SADDR +# GCN: GLOBAL_LOAD_SSHORT_SADDR +# GCN: GLOBAL_STORE_SHORT_SADDR +# GCN: GLOBAL_LOAD_USHORT_SADDR +# GCN: GLOBAL_STORE_SHORT_SADDR +# GCN: GLOBAL_LOAD_UBYTE_SADDR +# GCN: GLOBAL_STORE_BYTE_SADDR +# GCN: GLOBAL_LOAD_SBYTE_SADDR +# GCN: GLOBAL_STORE_BYTE_SADDR +# GCN: GLOBAL_LOAD_SBYTE_D16_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_UBYTE_D16_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SBYTE_D16_HI_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_UBYTE_D16_HI_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SHORT_D16_HI_SADDR +# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SHORT_D16_SADDR +# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR + +# GCN: GLOBAL_ATOMIC_XOR_SADDR_RTN +# GCN: GLOBAL_ATOMIC_XOR_SADDR % +# GCN: GLOBAL_ATOMIC_SMIN_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SMIN_SADDR % +# GCN: GLOBAL_ATOMIC_AND_SADDR_RTN +# GCN: GLOBAL_ATOMIC_AND_SADDR % +# GCN: GLOBAL_ATOMIC_SWAP_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SWAP_SADDR % +# GCN: GLOBAL_ATOMIC_SMAX_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SMAX_SADDR % +# GCN: GLOBAL_ATOMIC_UMIN_SADDR_RTN +# GCN: GLOBAL_ATOMIC_UMIN_SADDR % +# GCN: GLOBAL_ATOMIC_UMAX_SADDR_RTN +# GCN: GLOBAL_ATOMIC_UMAX_SADDR % +# GCN: GLOBAL_ATOMIC_OR_SADDR_RTN +# GCN: GLOBAL_ATOMIC_OR_SADDR % +# GCN: GLOBAL_ATOMIC_ADD_SADDR_RTN +# GCN: GLOBAL_ATOMIC_ADD_SADDR % +# GCN: GLOBAL_ATOMIC_SUB_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SUB_SADDR % +# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR_RTN +# GCN: GLOBAL_ATOMIC_CMPSWAP_SADDR % +# GCN: GLOBAL_ATOMIC_INC_SADDR_RTN +# GCN: GLOBAL_ATOMIC_INC_SADDR % +# GCN: GLOBAL_ATOMIC_DEC_SADDR_RTN +# GCN: GLOBAL_ATOMIC_DEC_SADDR % + +# GCN: GLOBAL_ATOMIC_OR_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_OR_X2_SADDR % +# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_XOR_X2_SADDR % +# GCN: GLOBAL_ATOMIC_AND_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_AND_X2_SADDR % +# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_ADD_X2_SADDR % +# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SUB_X2_SADDR % +# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_DEC_X2_SADDR % +# GCN: GLOBAL_ATOMIC_INC_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_INC_X2_SADDR % +# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SMIN_X2_SADDR % +# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SWAP_X2_SADDR % +# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_SMAX_X2_SADDR % +# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_UMIN_X2_SADDR % +# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_UMAX_X2_SADDR % +# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR_RTN +# GCN: GLOBAL_ATOMIC_CMPSWAP_X2_SADDR % + +name: global_load_store_atomics +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 36, 0 :: (dereferenceable invariant load 8 ) + %5:sreg_32_xm0 = S_MOV_B32 2 + %6:vgpr_32 = V_LSHLREV_B32_e64 killed %5, %0, implicit $exec + %7:sreg_32_xm0 = S_MOV_B32 0 + %15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %14:vreg_64 = REG_SEQUENCE killed %6, %subreg.sub0, killed %15, %subreg.sub1 + %21:sgpr_32 = COPY %4.sub0 + %22:vgpr_32 = COPY %14.sub0 + %23:sgpr_32 = COPY %4.sub1 + 
%24:vgpr_32 = COPY %14.sub1 + %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21, %22, implicit $exec + %25:vgpr_32 = COPY %23 + %18:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 %25, %24, killed %19, implicit $exec + %16:vreg_64 = REG_SEQUENCE %17, %subreg.sub0, %18, %subreg.sub1 + %11:vreg_64 = COPY %16 + + %10:vgpr_32 = GLOBAL_LOAD_DWORD %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORD %11, %10, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %40:vreg_64 = GLOBAL_LOAD_DWORDX2 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX2 %11, %40, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %41:vreg_96 = GLOBAL_LOAD_DWORDX3 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX3 %11, %41, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %42:vreg_128 = GLOBAL_LOAD_DWORDX4 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX4 %11, %42, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %43:vgpr_32 = GLOBAL_LOAD_SSHORT %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT %11, %43, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %44:vgpr_32 = GLOBAL_LOAD_USHORT %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT %11, %44, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %45:vgpr_32 = GLOBAL_LOAD_UBYTE %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE %11, %45, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %46:vgpr_32 = GLOBAL_LOAD_SBYTE %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE %11, %46, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %47:vgpr_32 = GLOBAL_LOAD_SBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %47, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %48:vgpr_32 = GLOBAL_LOAD_UBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %48, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %49:vgpr_32 = GLOBAL_LOAD_SBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %49, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %50:vgpr_32 = GLOBAL_LOAD_UBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %50, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %51:vgpr_32 = GLOBAL_LOAD_SHORT_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT_D16_HI %11, %51, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %52:vgpr_32 = GLOBAL_LOAD_SHORT_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT_D16_HI %11, %52, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + + %53:vgpr_32 = GLOBAL_ATOMIC_XOR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %53, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_XOR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %54:vgpr_32 = 
GLOBAL_ATOMIC_SMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %54, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %55:vgpr_32 = GLOBAL_ATOMIC_AND_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %55, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_AND %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %56:vgpr_32 = GLOBAL_ATOMIC_SWAP_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %56, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SWAP %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %57:vgpr_32 = GLOBAL_ATOMIC_SMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %57, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %58:vgpr_32 = GLOBAL_ATOMIC_UMIN_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %58, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_UMIN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %59:vgpr_32 = GLOBAL_ATOMIC_UMAX_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %59, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_UMAX %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %60:vgpr_32 = GLOBAL_ATOMIC_OR_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %60, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_OR %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %61:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %61, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_ADD %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %62:vgpr_32 = GLOBAL_ATOMIC_SUB_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %62, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SUB %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %63:vgpr_32 = GLOBAL_ATOMIC_CMPSWAP_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %63, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_CMPSWAP %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %64:vgpr_32 = GLOBAL_ATOMIC_INC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, 
%64, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_INC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %65:vgpr_32 = GLOBAL_ATOMIC_DEC_RTN %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORD %11, %65, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_DEC %11, %15, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %66:vreg_64 = GLOBAL_ATOMIC_OR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %66, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_OR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %67:vreg_64 = GLOBAL_ATOMIC_XOR_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %67, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_XOR_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %68:vreg_64 = GLOBAL_ATOMIC_AND_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %68, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_AND_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %69:vreg_64 = GLOBAL_ATOMIC_ADD_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %69, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_ADD_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %70:vreg_64 = GLOBAL_ATOMIC_SUB_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %70, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SUB_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %71:vreg_64 = GLOBAL_ATOMIC_DEC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %71, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_DEC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %72:vreg_64 = GLOBAL_ATOMIC_INC_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %72, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_INC_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %73:vreg_64 = GLOBAL_ATOMIC_SMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %73, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %74:vreg_64 = GLOBAL_ATOMIC_SWAP_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %74, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* 
undef`, addrspace 1) + GLOBAL_ATOMIC_SWAP_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %75:vreg_64 = GLOBAL_ATOMIC_SMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %75, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_SMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %76:vreg_64 = GLOBAL_ATOMIC_UMIN_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %76, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_UMIN_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %77:vreg_64 = GLOBAL_ATOMIC_UMAX_X2_RTN %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %77, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_UMAX_X2 %11, %16, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + %79:sreg_128 = REG_SEQUENCE %4, %subreg.sub0, %4, %subreg.sub1, %4, %subreg.sub2, %4, %subreg.sub3 + %80:vreg_128 = COPY %79 + + %78:vreg_64 = GLOBAL_ATOMIC_CMPSWAP_X2_RTN %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + GLOBAL_STORE_DWORDX2 %11, %78, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + GLOBAL_ATOMIC_CMPSWAP_X2 %11, %80, 16, 0, implicit $exec :: (volatile load store seq_cst 4, addrspace 1) + + S_ENDPGM +... Index: test/CodeGen/AMDGPU/global-saddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-saddr.ll @@ -0,0 +1,102 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s + +; Test for a conv2d like sequence of loads. 
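+; The loads below all index off %gep, whose base pointer is a kernel argument
+; (an SGPR pair) and whose index is the VGPR workitem id, so each access should
+; come out as a global_load_*/global_store_* with the kernel-argument SGPRs in
+; saddr and the small element offsets folded into the immediate offset field.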
+ +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}} +; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} + +define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep = getelementptr i64, i64 addrspace(1)* %src_image, i64 %idx + %ptr0 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1 + %load0 = load i64, i64 addrspace(1)* %ptr0 + %ptr1 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 2 + %load1 = load i64, i64 addrspace(1)* %ptr1 + %ptr2 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 3 + %load2 = load i64, i64 addrspace(1)* %ptr2 + %ptr3 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 4 + %load3 = load i64, i64 addrspace(1)* %ptr3 + %ptr4 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -4 + %load4 = load i64, i64 addrspace(1)* %ptr4 + %ptr5 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -3 + %load5 = load i64, i64 addrspace(1)* %ptr5 + %ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2 + %load6 = load i64, i64 addrspace(1)* %ptr6 + %ptr7 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1 + %load7 = load i64, i64 addrspace(1)* %ptr7 + %ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 0 + %load8 = load i64, i64 addrspace(1)* %ptr8 + %add0 = add i64 %load1, %load0 + %add1 = add i64 %load3, %load2 + %add2 = add i64 %load5, %load4 + %add3 = add i64 %load7, %load6 + %add4 = add i64 %add0, %load8 + %add5 = add i64 %add2, %add1 + %add6 = add i64 %add4, %add3 + %add7 = add i64 %add6, %add5 + %gep9 = getelementptr i64, i64 addrspace(1)* %dst_image, i64 %idx + %ptr9 = getelementptr inbounds i64, i64 addrspace(1)* %gep9, i64 1 + store volatile i64 %add7, i64 addrspace(1)* %ptr9 + +; Test various offset boundaries. 
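+; On gfx9 the global_* immediate offset is a signed 13-bit value, so only
+; offsets in [-4096, 4095] can be folded; accesses below that fall outside
+; that range are expected to drop back to the plain vaddr form printed as
+; "off".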
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}} + %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511 + %load11 = load i64, i64 addrspace(1)* %gep11 + %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023 + %load12 = load i64, i64 addrspace(1)* %gep12 + %gep13 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255 + %load13 = load i64, i64 addrspace(1)* %gep13 + %add11 = add i64 %load11, %load12 + %add12 = add i64 %add11, %load13 + store volatile i64 %add12, i64 addrspace(1)* undef + +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}} + %gep21 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -1024 + %load21 = load i64, i64 addrspace(1)* %gep21 + %gep22 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -2048 + %load22 = load i64, i64 addrspace(1)* %gep22 + %gep23 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 -512 + %load23 = load i64, i64 addrspace(1)* %gep23 + %add21 = add i64 %load22, %load21 + %add22 = add i64 %add21, %load23 + store volatile i64 %add22, i64 addrspace(1)* undef + +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}} + %gep31 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 257 + %load31 = load i64, i64 addrspace(1)* %gep31 + %gep32 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 256 + %load32 = load i64, i64 addrspace(1)* %gep32 + %gep33 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 255 + %load33 = load i64, i64 addrspace(1)* %gep33 + %add34 = add i64 %load32, %load31 + %add35 = add i64 %add34, %load33 + store volatile i64 %add35, i64 addrspace(1)* undef + ret void +} + +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16{{$}} +; GFX9-NEXT: s_waitcnt +; NGFX9-NOT: global_load_dword + +define amdgpu_cs void @_amdgpu_cs_main(i64 inreg %arg) { +bb: + %tmp1 = inttoptr i64 %arg to <4 x i64> addrspace(1)* + %tmp2 = load <4 x i64>, <4 x i64> addrspace(1)* %tmp1, align 16 + store volatile <4 x i64> %tmp2, <4 x i64> addrspace(1)* undef + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind readnone speculatable } Index: test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/indirect-addressing-si-gfx9.ll @@ -0,0 +1,85 @@ +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s + +; indexing of vectors. + +; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll +; to avoid gfx9 scheduling induced issues. 
+ + +; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: +; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] +; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 + +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]] +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:3]], s[[S_ELT0]] + +; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN: s_and_saveexec_b64 vcc, vcc + +; MOVREL: s_mov_b32 m0, [[READLANE]] +; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]] + +; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst +; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]] +; IDXMODE: s_set_gpr_idx_off + +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN: s_cbranch_execnz [[LOOP0]] + +; FIXME: Redundant copy +; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]] + +; GCN: s_mov_b64 [[MASK]], exec + +; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN: s_and_saveexec_b64 vcc, vcc + +; MOVREL: s_mov_b32 m0, [[READLANE]] +; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63 + +; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst +; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63 +; IDXMODE: s_set_gpr_idx_off + +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN: s_cbranch_execnz [[LOOP1]] + +; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: + +; GCN: buffer_store_dword [[INS0]] +define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id.ext = zext i32 %id to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext + %idx0 = load volatile i32, i32 addrspace(1)* %gep + %idx1 = add i32 %idx0, 1 + %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() + %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0 + %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1 + store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0 + %cmp = icmp eq i32 %id, 0 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + store volatile i32 %live.out.val, i32 addrspace(1)* undef + br label %bb2 + +bb2: + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare void @llvm.amdgcn.s.barrier() #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind convergent } Index: test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll @@ -0,0 +1,88 @@ +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s + +; Tests for indirect addressing on SI, which is 
implemented using dynamic +; indexing of vectors. + +; Subtest below moved from file test/CodeGen/AMDGPU/indirect-addressing-si.ll +; to avoid gfx9 scheduling induced issues. + + +; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: +; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}} +; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] +; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 + +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]] +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}} +; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] + +; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN: s_and_saveexec_b64 vcc, vcc + +; MOVREL: s_mov_b32 m0, [[READLANE]] +; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]] + +; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst +; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]] +; IDXMODE: s_set_gpr_idx_off + +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN: s_cbranch_execnz [[LOOP0]] + +; FIXME: Redundant copy +; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]] + +; GCN: s_mov_b64 [[MASK]], exec + +; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] +; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] +; GCN: s_and_saveexec_b64 vcc, vcc + +; MOVREL: s_mov_b32 m0, [[READLANE]] +; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63 + +; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst +; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63 +; IDXMODE: s_set_gpr_idx_off + +; GCN-NEXT: s_xor_b64 exec, exec, vcc +; GCN: s_cbranch_execnz [[LOOP1]] + +; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: + +; GCN: buffer_store_dword [[INS0]] +define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() #1 + %id.ext = zext i32 %id to i64 + %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext + %idx0 = load volatile i32, i32 addrspace(1)* %gep + %idx1 = add i32 %idx0, 1 + %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() + %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0 + %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1 + store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0 + %cmp = icmp eq i32 %id, 0 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + store volatile i32 %live.out.val, i32 addrspace(1)* undef + br label %bb2 + +bb2: + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare void @llvm.amdgcn.s.barrier() #2 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind convergent } Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -392,76 +392,9 @@ ret void } -; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block: -; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}} -; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]] -; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62 - -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]] -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}} -; GCN-DAG: 
v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}} -; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]] - -; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]] - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst -; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]] -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN: s_cbranch_execnz [[LOOP0]] - -; FIXME: Redundant copy -; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]] - -; GCN: s_mov_b64 [[MASK]], exec - -; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]: -; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]] -; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]] -; GCN: s_and_saveexec_b64 vcc, vcc - -; MOVREL: s_mov_b32 m0, [[READLANE]] -; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63 - -; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst -; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63 -; IDXMODE: s_set_gpr_idx_off - -; GCN-NEXT: s_xor_b64 exec, exec, vcc -; GCN: s_cbranch_execnz [[LOOP1]] - -; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: - -; GCN: buffer_store_dword [[INS0]] -define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { -entry: - %id = call i32 @llvm.amdgcn.workitem.id.x() #1 - %id.ext = zext i32 %id to i64 - %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext - %idx0 = load volatile i32, i32 addrspace(1)* %gep - %idx1 = add i32 %idx0, 1 - %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"() - %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0 - %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1 - store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0 - %cmp = icmp eq i32 %id, 0 - br i1 %cmp, label %bb1, label %bb2 - -bb1: - store volatile i32 %live.out.val, i32 addrspace(1)* undef - br label %bb2 - -bb2: - ret void -} +; Moved subtest for insert_vgpr_offset_multiple_in_block to separate file to +; avoid very different schedule induced isses with gfx9. 
+; test/CodeGen/AMDGPU/indirect-addressing-si-pregfx9.ll ; GCN-LABEL: {{^}}insert_adjacent_blocks: Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -444,40 +444,12 @@ ret void } -; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 - -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] - -; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] - -; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] - -; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 - %tid.ext = sext i32 %tid to i64 - %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext - %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext - %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext - %idx = load i32, i32 addrspace(1)* %idx.gep - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep - %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx - store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep - ret void -} - ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234 -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -8,8 +8,10 @@ ; GCN-LABEL: {{^}}madak_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* 
noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -88,8 +90,10 @@ ; GCN-LABEL: {{^}}madak_inline_imm_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone Index: test/CodeGen/AMDGPU/memory-legalizer-load.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -319,7 +319,7 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32 addrspace(1)* %in, i32* %out) { entry: Index: test/CodeGen/AMDGPU/memory-legalizer-store.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -240,7 +240,7 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} +; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32* %in, i32 addrspace(1)* %out) { entry: Index: test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- test/CodeGen/AMDGPU/memory_clause.ll +++ test/CodeGen/AMDGPU/memory_clause.ll @@ -105,7 +105,7 @@ } ; GCN-LABEL: {{^}}vector_clause_indirect: -; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], off +; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -265,16 +265,15 @@ ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 +; GFX9: global_load_dword {{v[0-9]+}}, 
v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}} -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20 - -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52 define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64