Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -41,6 +41,7 @@
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIFixupVectorISelPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -118,6 +119,9 @@
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
+void initializeSIFixupVectorISelPass(PassRegistry &);
+extern char &SIFixupVectorISelID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -161,6 +161,7 @@
   initializeSILowerI1CopiesPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
+  initializeSIFixupVectorISelPass(*PR);
   initializeSIFoldOperandsPass(*PR);
   initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
@@ -813,6 +814,7 @@
   AMDGPUPassConfig::addInstSelector();
   addPass(createSILowerI1CopiesPass());
   addPass(&SIFixSGPRCopiesID);
+  addPass(createSIFixupVectorISelPass());
   return false;
 }
 
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -95,6 +95,7 @@
   SIAnnotateControlFlow.cpp
   SIDebuggerInsertNops.cpp
   SIFixSGPRCopies.cpp
+  SIFixupVectorISel.cpp
   SIFixVGPRCopies.cpp
   SIFixWWMLiveness.cpp
   SIFoldOperands.cpp
Index: lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- lib/Target/AMDGPU/FLATInstructions.td
+++ lib/Target/AMDGPU/FLATInstructions.td
@@ -121,6 +121,11 @@
   let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
 }
 
+class GlobalSaddrTable <bit is_saddr, string Name = ""> {
+  bit IsSaddr = is_saddr;
+  string SaddrOp = Name;
+}
+
 // TODO: Is exec allowed for saddr? The disabled value 0x7f is the
 // same encoding value as exec_hi, so it isn't possible to use that if
 // saddr is 32-bit (which isn't handled here yet).
@@ -171,15 +176,19 @@
 multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>;
-    def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>;
+    def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
+      GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+      GlobalSaddrTable<1, opName>;
   }
 }
 
 multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Store_Pseudo<opName, regClass, 1>;
-    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>;
+    def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
+      GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+      GlobalSaddrTable<1, opName>;
   }
 }
 
@@ -287,6 +296,7 @@
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, off$offset$slc">,
+    GlobalSaddrTable<0, opName>,
     AtomicNoRet {
     let has_saddr = 1;
     let PseudoInstr = NAME;
@@ -298,6 +308,7 @@
     " $vdst, $vaddr, $vdata, off$offset glc$slc",
     [(set vt:$vdst, (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+    GlobalSaddrTable<0, opName#"_rtn">,
    AtomicNoRet {
     let has_saddr = 1;
   }
@@ -306,6 +317,7 @@
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, $saddr$offset$slc">,
+    GlobalSaddrTable<1, opName>,
     AtomicNoRet {
     let has_saddr = 1;
     let enabled_saddr = 1;
@@ -316,6 +328,7 @@
     (outs vdst_rc:$vdst),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
+    GlobalSaddrTable<1, opName#"_rtn">,
     AtomicNoRet {
     let has_saddr = 1;
     let enabled_saddr = 1;
Index: lib/Target/AMDGPU/SIFixupVectorISel.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -0,0 +1,212 @@
+//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// SIFixupVectorISel pass cleans up post-ISel vector issues.
+/// Currently this will convert GLOBAL_{LOAD|STORE}_*
+/// and GLOBAL_ATOMIC_* instructions into their _SADDR variants,
+/// feeding the sreg into the saddr field of the new instruction.
+/// We currently handle a REG_SEQUENCE feeding the vaddr
+/// and decompose it into a base and index.
+///
+/// Transform:
+/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32
+/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32,
+///                                    %24:vgpr_32, %19:sreg_64_xexec
+/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1
+/// %11:vreg_64 = COPY %16:vreg_64
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0
+/// Into:
+/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
+/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16...
+///
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "si-fixup-vector-isel"
+
+using namespace llvm;
+
+STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
+STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
+
+namespace {
+
+class SIFixupVectorISel : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixupVectorISel() : MachineFunctionPass(ID) {
+    initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
+                "SI Fixup Vector ISel", false, false)
+
+char SIFixupVectorISel::ID = 0;
+
+char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
+
+FunctionPass *llvm::createSIFixupVectorISelPass() {
+  return new SIFixupVectorISel();
+}
+
+static bool findSRegBaseAndIndex(MachineOperand *Op,
+                                 unsigned &BaseReg,
+                                 unsigned &IndexReg,
+                                 MachineRegisterInfo &MRI,
+                                 const SIRegisterInfo *TRI) {
+  SmallVector<MachineOperand *, 8> Worklist;
+  Worklist.push_back(Op);
+  while (!Worklist.empty()) {
+    MachineOperand *WOp = Worklist.pop_back_val();
+    if (!WOp->isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+      continue;
+    MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
+    switch (DefInst->getOpcode()) {
+    default:
+      continue;
+    case AMDGPU::COPY:
+      Worklist.push_back(&DefInst->getOperand(1));
+      break;
+    case AMDGPU::REG_SEQUENCE:
+      if (DefInst->getNumOperands() != 5)
+        continue;
+      Worklist.push_back(&DefInst->getOperand(1));
+      Worklist.push_back(&DefInst->getOperand(3));
+      break;
+    case AMDGPU::V_ADD_I32_e64:
+      if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      BaseReg = DefInst->getOperand(2).getReg();
+      if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      IndexReg = DefInst->getOperand(3).getReg();
+      // Chase the IndexReg.
+      MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      IndexReg = MI->getOperand(1).getReg();
+      // Chase the BaseReg.
+      MI = MRI.getUniqueVRegDef(BaseReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      BaseReg = MI->getOperand(1).getReg();
+      // Make sure Base is SReg and Index is VReg.
+      if (!TRI->isSGPRReg(MRI, BaseReg))
+        return false;
+      if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
+        return false;
+      // Clear any kill flags on the Index and Base regs; they are used later.
+      MRI.clearKillFlags(IndexReg);
+      MRI.clearKillFlags(BaseReg);
+      return true;
+    }
+  }
+  return false;
+}
+
+// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR.
+static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
+                             MachineFunction &MF,
+                             MachineRegisterInfo &MRI,
+                             const GCNSubtarget &ST,
+                             const SIInstrInfo *TII,
+                             const SIRegisterInfo *TRI) {
+  bool FuncModified = false;
+  MachineBasicBlock::iterator I, Next;
+  for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MI = *I;
+    int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
+    if (NewOpcd < 0)
+      continue;
+    // Update our statistics on opportunities seen.
+    ++NumSGPRGlobalOccurs;
+    LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
+    // Need a Base and Index or we can't transform to _SADDR.
+    unsigned BaseReg = 0;
+    unsigned IndexReg = 0;
+    MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+    if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
+      continue;
+    ++NumSGPRGlobalSaddrs;
+    FuncModified = true;
+    // Create the new _SADDR Memory instruction.
+    bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
+    MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+    MachineInstr *NewGlob = nullptr;
+    if (HasVdst)
+      NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd),
+                        MI.getOperand(0).getReg());
+    else
+      // No vdst field.
+      NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
+    if (VData)
+      NewGlob->addOperand(MF, *VData);
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
+
+    MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
+    // Atomics don't have a GLC, so omit the field if not there.
+    if (Glc)
+      NewGlob->addOperand(MF, *Glc);
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
+    // _D16 instructions have a vdst_in operand; copy it in.
+    MachineOperand *VDstInOp = TII->getNamedOperand(MI,
+                                      AMDGPU::OpName::vdst_in);
+    if (VDstInOp)
+      NewGlob->addOperand(MF, *VDstInOp);
+    NewGlob->copyImplicitOps(MF, MI);
+    NewGlob->cloneMemRefs(MF, MI);
+    // Remove the old Global Memop instruction.
+    MI.eraseFromParent();
+    LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
+  }
+  return FuncModified;
+}
+
+bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  bool FuncModified = false;
+  for (MachineBasicBlock &MBB : MF)
+    // Clean up missed saddr opportunities from ISel.
+    FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI);
+  return FuncModified;
+}
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -954,6 +954,9 @@
   LLVM_READONLY
   int getSOPKOp(uint16_t Opcode);
 
+  LLVM_READONLY
+  int getGlobalSaddrOp(uint16_t Opcode);
+
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -2017,6 +2017,15 @@
   let ValueCols = [["0"]];
 }
 
+// Maps a GLOBAL to its SADDR form.
+def getGlobalSaddrOp : InstrMapping { + let FilterClass = "GlobalSaddrTable"; + let RowFields = ["SaddrOp"]; + let ColFields = ["IsSaddr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + include "SIInstructions.td" include "DSInstructions.td" Index: test/CodeGen/AMDGPU/conv2d-saddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/conv2d-saddr.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SADDRGFX9 %s + +; SADDRGFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; SADDRGFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} +; SADDRGFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} +; SADDRGFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4{{$}} + +define hidden amdgpu_kernel void @simpleSaddrs(i32 addrspace(1)* nocapture %dst_image, i32 addrspace(1)* nocapture readonly %src_image, i32 addrspace(1)* nocapture readonly %conv_kernel) local_unnamed_addr #0 { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep = getelementptr i32, i32 addrspace(1)* %src_image, i64 %idx + %ptr0 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 1 + %load0 = load i32, i32 addrspace(1)* %ptr0 + %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 2 + %load1 = load i32, i32 addrspace(1)* %ptr1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 3 + %load2 = load i32, i32 addrspace(1)* %ptr2 + %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 4 + %load3 = load i32, i32 addrspace(1)* %ptr3 + %ptr4 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 -4 + %load4 = load i32, i32 addrspace(1)* %ptr4 + %ptr5 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 -3 + %load5 = load i32, i32 addrspace(1)* %ptr5 + %ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 -2 + %load6 = load i32, i32 addrspace(1)* %ptr6 + %ptr7 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 -1 + %load7 = load i32, i32 addrspace(1)* %ptr7 + %ptr8 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 0 + %load8 = load i32, i32 addrspace(1)* %ptr8 + %add0 = add i32 %load1, %load0 + %add1 = add i32 %load3, %load2 + %add2 = add i32 %load5, %load4 + %add3 = add i32 %load7, %load6 + %add4 = add i32 %add0, %load8 + %add5 = add i32 %add2, %add1 + %add6 = add i32 %add4, %add3 + %add7 = add i32 %add6, %add5 + %gep9 = getelementptr i32, i32 addrspace(1)* %dst_image, i64 %idx + %ptr9 = getelementptr inbounds i32, i32 addrspace(1)* %gep9, i64 1 + store volatile i32 %add7, i32 addrspace(1)* %ptr9 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind readnone speculatable } Index: test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -31,8 +31,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 @@ -177,8 +177,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 @@ -362,8 +362,8 @@ ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2st64.ll +++ test/CodeGen/AMDGPU/ds_write2st64.ll @@ -30,8 +30,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} @@ -59,8 +59,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] @@ -87,8 +87,8 @@ ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] Index: test/CodeGen/AMDGPU/global-load_stores.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-load_stores.mir @@ -0,0 +1,86 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-fixup-vector-isel %s -o - | FileCheck -check-prefix=GCN %s + +# Coverage tests for GLOBAL_* to their _SADDR equivalent. + +# GCN-LABEL: name: global_loads_stores +# GCN: GLOBAL_LOAD_DWORD_SADDR +# GCN: GLOBAL_STORE_DWORD_SADDR +# GCN: GLOBAL_LOAD_DWORDX2_SADDR +# GCN: GLOBAL_STORE_DWORDX2_SADDR +# GCN: GLOBAL_LOAD_DWORDX3_SADDR +# GCN: GLOBAL_STORE_DWORDX3_SADDR +# GCN: GLOBAL_LOAD_DWORDX4_SADDR +# GCN: GLOBAL_STORE_DWORDX4_SADDR +# GCN: GLOBAL_LOAD_SSHORT_SADDR +# GCN: GLOBAL_STORE_SHORT_SADDR +# GCN: GLOBAL_LOAD_USHORT_SADDR +# GCN: GLOBAL_STORE_SHORT_SADDR +# GCN: GLOBAL_LOAD_UBYTE_SADDR +# GCN: GLOBAL_STORE_BYTE_SADDR +# GCN: GLOBAL_LOAD_SBYTE_SADDR +# GCN: GLOBAL_STORE_BYTE_SADDR +# GCN: GLOBAL_LOAD_SBYTE_D16_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_UBYTE_D16_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SBYTE_D16_HI_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_UBYTE_D16_HI_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SHORT_D16_HI_SADDR +# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SHORT_D16_SADDR +# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR + +name: global_loads_stores +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 36, 0 :: (dereferenceable invariant load 8 ) + %5:sreg_32_xm0 = S_MOV_B32 2 + %6:vgpr_32 = V_LSHLREV_B32_e64 killed %5, %0, implicit $exec + %7:sreg_32_xm0 = S_MOV_B32 0 + %15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %14:vreg_64 = REG_SEQUENCE killed %6, %subreg.sub0, killed %15, %subreg.sub1 + %21:sgpr_32 = COPY %4.sub0 + %22:vgpr_32 = COPY %14.sub0 + %23:sgpr_32 = COPY %4.sub1 + %24:vgpr_32 = COPY %14.sub1 + %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21, %22, implicit $exec + %25:vgpr_32 = COPY %23 + %18:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 %25, %24, killed %19, implicit $exec + %16:vreg_64 = REG_SEQUENCE %17, %subreg.sub0, %18, %subreg.sub1 + %11:vreg_64 = COPY %16 + %10:vgpr_32 = GLOBAL_LOAD_DWORD %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORD %11, %10, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %40:vreg_64 = GLOBAL_LOAD_DWORDX2 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX2 %11, %40, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %41:vreg_96 = GLOBAL_LOAD_DWORDX3 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX3 %11, %41, 0, 0, 0, 
implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %42:vreg_128 = GLOBAL_LOAD_DWORDX4 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX4 %11, %42, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %43:vgpr_32 = GLOBAL_LOAD_SSHORT %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT %11, %43, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %44:vgpr_32 = GLOBAL_LOAD_USHORT %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT %11, %44, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %45:vgpr_32 = GLOBAL_LOAD_UBYTE %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE %11, %45, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %46:vgpr_32 = GLOBAL_LOAD_SBYTE %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE %11, %46, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %47:vgpr_32 = GLOBAL_LOAD_SBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %47, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %48:vgpr_32 = GLOBAL_LOAD_UBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %48, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %49:vgpr_32 = GLOBAL_LOAD_SBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %49, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %50:vgpr_32 = GLOBAL_LOAD_UBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %50, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %51:vgpr_32 = GLOBAL_LOAD_SHORT_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT_D16_HI %11, %51, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %52:vgpr_32 = GLOBAL_LOAD_SHORT_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT_D16_HI %11, %52, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + S_ENDPGM +... 
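The conversions checked in the MIR test above are driven entirely by the TableGen-generated opcode table behind getGlobalSaddrOp. As a minimal C++ sketch of querying that mapping from target code (assuming the usual lib/Target/AMDGPU includes; the hasGlobalSaddrForm helper name is illustrative and not part of this patch):

  // Sketch only: the generated AMDGPU::getGlobalSaddrOp mapping returns the
  // _SADDR opcode for a GLOBAL_* instruction, or -1 when no such form exists.
  #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  #include "SIInstrInfo.h"

  static bool hasGlobalSaddrForm(unsigned Opcode) {
    return AMDGPU::getGlobalSaddrOp(Opcode) >= 0;
  }

Any opcode not listed in a GlobalSaddrTable row (for example the FLAT_* forms) simply yields -1 and is left untouched by the pass.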
Index: test/CodeGen/AMDGPU/global-saddr-atomics.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -0,0 +1,359 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s + +; GFX9-LABEL: {{^}}atomic_xor_i32_offset: +; GFX9: global_atomic_xor v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile xor i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile xor i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_xor_i64_offset: +; GFX9: global_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile xor i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile xor i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_smin_i32_offset: +; GFX9: global_atomic_smin v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_smin_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile min i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile min i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_smin_i64_offset: +; GFX9: global_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_smin_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile min i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile min i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_and_i32_offset: +; GFX9: global_atomic_and 
v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile and i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile and i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_and_i64_offset: +; GFX9: global_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile and i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile and i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_swap_i32_offset: +; GFX9: global_atomic_swap v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_swap_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile xchg i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile xchg i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_swap_i64_offset: +; GFX9: global_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_swap_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile xchg i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile xchg i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_smax_i32_offset: +; GFX9: global_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_smax_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = 
getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile max i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile max i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_smax_i64_offset: +; GFX9: global_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_smax_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile max i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile max i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_umin_i32_offset: +; GFX9: global_atomic_umin v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile umin i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile umin i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_umin_i64_offset: +; GFX9: global_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile umin i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile umin i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_umax_i32_offset: +; GFX9: global_atomic_umax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile umax i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile umax i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: 
{{^}}atomic_umax_i64_offset: +; GFX9: global_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile umax i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile umax i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_or_i32_offset: +; GFX9: global_atomic_or v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile or i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile or i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_or_i64_offset: +; GFX9: global_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile or i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile or i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_add_i32_offset: +; GFX9: global_atomic_add v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile add i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile add i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_add_i64_offset: +; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call 
i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile add i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile add i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_sub_i32_offset: +; GFX9: global_atomic_sub v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile sub i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile sub i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_sub_i64_offset: +; GFX9: global_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile sub i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile sub i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset: +; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} +; GFX9: global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out +, i32 addrspace(1)* %out2, i32%in, i32 %index, i32 %old) { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %out, i64 %idx + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst + %extract0 = extractvalue { i32, i1 } %val, 0 + store i32 %extract0, i32 addrspace(1)* %out2 + %val2 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset: +; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out +, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %out, i64 %idx + %ptr = getelementptr 
inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, i64 addrspace(1)* %out2 + %val2 = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() + Index: test/CodeGen/AMDGPU/global-saddr-misc.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-saddr-misc.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SADDRGFX9 %s + +; SADDRGFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; SADDRGFX9-NEXT: s_waitcnt +; SADDRGFX9-NOT: global_load_dword + +define amdgpu_cs void @_amdgpu_cs_main(<3 x i32> inreg %arg) { +bb: + %tmp = extractelement <3 x i32> %arg, i32 1 + %tmp1 = inttoptr i32 %tmp to <4 x i32> addrspace(1)* + %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp1, align 16 + store volatile <4 x i32> %tmp2, <4 x i32> addrspace(1)* undef + ret void +} Index: test/CodeGen/AMDGPU/global-saddr-offsets.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-saddr-offsets.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s + +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4092{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2044{{$}} +define amdgpu_kernel void @test_offsets(i32 addrspace(1)* %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 1023 + %load = load i32, i32 addrspace(1)* %gep1 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 2047 + %load1 = load i32, i32 addrspace(1)* %gep2 + %gep3 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 511 + %load2 = load i32, i32 addrspace(1)* %gep3 + %add1 = add i32 %load, %load1 + %add = add i32 %add1, %load2 + store volatile i32 %add, i32 addrspace(1)* undef + ret void +} + +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-2048{{$}} +define amdgpu_kernel void @test_offsets_neg(i32 addrspace(1)* %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 -1024 + %load = load i32, i32 addrspace(1)* %gep1 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 -2048 + %load1 = load i32, i32 addrspace(1)* %gep2 + %gep3 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 -512 + %load2 = load i32, i32 addrspace(1)* %gep3 + %add1 = add i32 %load, %load1 + %add = add i32 %add1, %load2 + store volatile i32 %add, i32 addrspace(1)* undef + ret void +} + + +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2048{{$}} +; GFX9: global_load_dword 
v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2056{{$}} +define amdgpu_kernel void @test_offsets_adjoin(i32 addrspace(1)* %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 514 + %load = load i32, i32 addrspace(1)* %gep1 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 512 + %load1 = load i32, i32 addrspace(1)* %gep2 + %gep3 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 513 + %load2 = load i32, i32 addrspace(1)* %gep3 + %add1 = add i32 %load, %load1 + %add = add i32 %add1, %load2 + store volatile i32 %add, i32 addrspace(1)* undef + ret void +} +declare i32 @llvm.amdgcn.workitem.id.x() + Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -444,40 +444,12 @@ ret void } -; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 - -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] - -; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] - -; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] - -; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 - %tid.ext = sext i32 %tid to i64 - %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext - %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext - %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext - %idx = load i32, i32 addrspace(1)* %idx.gep - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep - %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx - store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep - ret void -} - ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234 -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck 
-enable-var-scope -check-prefixes=GCN,CIVI,CI %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } 
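The offset-folding behaviour exercised earlier in global-saddr-offsets.ll follows from the signed 13-bit immediate encoded by the offset_s13 operand on the GFX9 _SADDR forms. A minimal sketch of that legality check, stated here as an assumption drawn from those tests rather than code from this patch (the helper name is hypothetical):

  // GFX9 global _SADDR instructions take a signed 13-bit immediate offset
  // (offset_s13). Offsets outside [-4096, 4095] cannot be folded, so the
  // load/store keeps a recomputed 64-bit vaddr instead.
  static bool isLegalGlobalSaddrImmOffset(int64_t Offset) {
    return Offset >= -4096 && Offset <= 4095;
  }

This is why the 4092 and -4096 byte offsets in those tests fold into the instruction while the 8188 and -8192 byte offsets fall back to the off form.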
Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -8,8 +8,10 @@ ; GCN-LABEL: {{^}}madak_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -88,8 +90,10 @@ ; GCN-LABEL: {{^}}madak_inline_imm_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone Index: test/CodeGen/AMDGPU/memory-legalizer-load.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -319,7 +319,7 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32 addrspace(1)* %in, i32* %out) { entry: Index: test/CodeGen/AMDGPU/memory-legalizer-store.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -240,7 +240,7 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} +; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32* %in, i32 addrspace(1)* %out) { entry: Index: test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- test/CodeGen/AMDGPU/memory_clause.ll +++ test/CodeGen/AMDGPU/memory_clause.ll @@ -105,7 +105,7 @@ } ; GCN-LABEL: {{^}}vector_clause_indirect: -; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], off +; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll 
+++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -265,16 +265,15 @@ ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}} -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20 - -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52 define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64