diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1811,6 +1811,25 @@
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
 
+//===----------------------------------------------------------------------===//
+// GFX9 Intrinsics
+//===----------------------------------------------------------------------===//
+
+class AMDGPUGlobalLoadLDS : Intrinsic <
+  [],
+  [LLVMQualPointerType<llvm_i8_ty, 1>,  // Base global pointer to load from
+   LLVMQualPointerType<llvm_i8_ty, 3>,  // LDS base pointer to store to
+   llvm_i32_ty,                         // Data byte size: 1/2/4
+   llvm_i32_ty,                         // imm offset (applied to both global and LDS address)
+   llvm_i32_ty],                        // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
+                                        //                                   bit 1 = slc/sc1,
+                                        //                                   bit 2 = dlc on gfx10+))
+                                        //                                   bit 4 = scc/nt on gfx90a+))
+  [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>],
+  "", [SDNPMemOperand]>;
+def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
+
 //===----------------------------------------------------------------------===//
 // GFX10 Intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -144,6 +144,7 @@
   bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
                               MachineOperand &DataOp) const;
   bool selectBufferLoadLds(MachineInstr &MI) const;
+  bool selectGlobalLoadLds(MachineInstr &MI) const;
   bool selectBVHIntrinsic(MachineInstr &I) const;
   bool selectSMFMACIntrin(MachineInstr &I) const;
   bool selectWaveAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1783,6 +1783,8 @@
   case Intrinsic::amdgcn_raw_buffer_load_lds:
   case Intrinsic::amdgcn_struct_buffer_load_lds:
     return selectBufferLoadLds(I);
+  case Intrinsic::amdgcn_global_load_lds:
+    return selectGlobalLoadLds(I);
   default: {
     return selectImpl(I, *CoverageInfo);
   }
@@ -3149,6 +3151,106 @@
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }
 
+/// Match a zero extend from a 32-bit value to 64-bits.
+static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+  Register ZExtSrc;
+  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
+    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+
+  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
+  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+    return false;
+
+  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+    return Def->getOperand(1).getReg();
+  }
+
+  return Register();
+}
+
+bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
+  unsigned Opc;
+  unsigned Size = MI.getOperand(3).getImm();
+
+  switch (Size) {
+  default:
+    return false;
+  case 1:
+    Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
+    break;
+  case 2:
+    Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
+    break;
+  case 4:
+    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
+    break;
+  }
+
+  MachineBasicBlock *MBB = MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+    .add(MI.getOperand(2));
+
+  Register Addr = MI.getOperand(1).getReg();
+  Register VOffset;
+  // Try to split SAddr and VOffset. Global and LDS pointers share the same
+  // immediate offset, so we cannot use a regular SelectGlobalSAddr().
+  if (!isSGPR(Addr)) {
+    auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+    if (isSGPR(AddrDef->Reg)) {
+      Addr = AddrDef->Reg;
+    } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
+      Register SAddr =
+          getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
+      if (SAddr && isSGPR(SAddr)) {
+        Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
+        if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+          Addr = SAddr;
+          VOffset = Off;
+        }
+      }
+    }
+  }
+
+  if (isSGPR(Addr)) {
+    Opc = AMDGPU::getGlobalSaddrOp(Opc);
+    if (!VOffset) {
+      VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
+        .addImm(0);
+    }
+  }
+
+  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
+    .addReg(Addr);
+
+  if (isSGPR(Addr))
+    MIB.addReg(VOffset);
+
+  MIB.add(MI.getOperand(4))  // offset
+     .add(MI.getOperand(5)); // cpol
+
+  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
+  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+  LoadPtrI.Offset = MI.getOperand(4).getImm();
+  MachinePointerInfo StorePtrI = LoadPtrI;
+  LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+  auto F = LoadMMO->getFlags() &
+           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+                                     Size, LoadMMO->getBaseAlign());
+  MachineMemOperand *StoreMMO =
+      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+                               sizeof(int32_t), Align(4));
+
+  MIB.setMemRefs({LoadMMO, StoreMMO});
+
+  MI.eraseFromParent();
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
   MI.setDesc(TII.get(MI.getOperand(1).getImm()));
   MI.removeOperand(1);
@@ -3687,24 +3789,6 @@
   }};
 }
 
-/// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
-  Register ZExtSrc;
-  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
-    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
-
-  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
-  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
-  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
-    return false;
-
-  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
-    return Def->getOperand(1).getReg();
-  }
-
-  return Register();
-}
-
 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3026,6 +3026,11 @@
       constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
       return;
     }
+    case Intrinsic::amdgcn_global_load_lds: {
+      applyDefaultMapping(OpdMapper);
+      constrainOpWithReadfirstlane(MI, MRI, 2);
+      return;
+    }
     default: {
       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4517,6 +4522,11 @@
       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
       break;
     }
+    case Intrinsic::amdgcn_global_load_lds: {
+      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      break;
+    }
     default:
       return getInvalidInstructionMapping();
     }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1318,6 +1318,14 @@
     Info.flags |= MachineMemOperand::MOStore;
     return true;
   }
+  case Intrinsic::amdgcn_global_load_lds: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
+    Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
+                  MachineMemOperand::MOVolatile;
+    return true;
+  }
   default:
     return false;
   }
@@ -8318,6 +8326,81 @@
     return SDValue(Load, 0);
   }
+  case Intrinsic::amdgcn_global_load_lds: {
+    unsigned Opc;
+    unsigned Size = Op->getConstantOperandVal(4);
+    switch (Size) {
+    default:
+      return SDValue();
+    case 1:
+      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
+      break;
+    case 2:
+      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
+      break;
+    case 4:
+      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
+      break;
+    }
+
+    auto *M = cast<MemSDNode>(Op);
+    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+
+    SmallVector<SDValue, 6> Ops;
+
+    SDValue Addr = Op.getOperand(2); // Global ptr
+    SDValue VOffset;
+    // Try to split SAddr and VOffset. Global and LDS pointers share the same
+    // immediate offset, so we cannot use a regular SelectGlobalSAddr().
+    if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
+      SDValue LHS = Addr.getOperand(0);
+      SDValue RHS = Addr.getOperand(1);
+
+      if (LHS->isDivergent())
+        std::swap(LHS, RHS);
+
+      if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
+          RHS.getOperand(0).getValueType() == MVT::i32) {
+        // add (i64 sgpr), (zero_extend (i32 vgpr))
+        Addr = LHS;
+        VOffset = RHS.getOperand(0);
+      }
+    }
+
+    Ops.push_back(Addr);
+    if (!Addr->isDivergent()) {
+      Opc = AMDGPU::getGlobalSaddrOp(Opc);
+      if (!VOffset)
+        VOffset = SDValue(
+            DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
+                               DAG.getTargetConstant(0, DL, MVT::i32)), 0);
+      Ops.push_back(VOffset);
+    }
+
+    Ops.push_back(Op.getOperand(5));  // Offset
+    Ops.push_back(Op.getOperand(6));  // CPol
+    Ops.push_back(M0Val.getValue(0)); // Chain
+    Ops.push_back(M0Val.getValue(1)); // Glue
+
+    MachineMemOperand *LoadMMO = M->getMemOperand();
+    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
+    LoadPtrI.Offset = Op->getConstantOperandVal(5);
+    MachinePointerInfo StorePtrI = LoadPtrI;
+    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
+    StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
+    auto F = LoadMMO->getFlags() &
+             ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
+    LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
+                                      Size, LoadMMO->getBaseAlign());
+    MachineMemOperand *StoreMMO =
+        MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
+                                sizeof(int32_t), Align(4));
+
+    auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+    DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
+
+    return SDValue(Load, 0);
+  }
   case Intrinsic::amdgcn_end_cf:
     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                       Op->getOperand(2), Chain), 0);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -435,6 +435,8 @@
     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
     if (DataOpIdx == -1)
       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+    if (DataOpIdx == -1) // LDS DMA
+      return false;
     Width = getOpSize(LdSt, DataOpIdx);
     return true;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX900
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX90A
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX940
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX10
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX900-GISEL
+
+declare void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
+
+define amdgpu_ps void @global_load_lds_dword_vaddr(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr) {
+; GFX900-LABEL: global_load_lds_dword_vaddr:
+; GFX900:       ; %bb.0: ; %main_body
+; GFX900-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX900-NEXT:    s_mov_b32 m0, s0
+; GFX900-NEXT:    s_nop 0
+; GFX900-NEXT:    global_load_dword v[0:1], off offset:16 glc lds
+; GFX900-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_dword_vaddr:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX90A-NEXT:    s_mov_b32 m0, s0
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v[0:1], off offset:16 glc lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX940-LABEL: global_load_lds_dword_vaddr:
+; GFX940:       ; %bb.0: ; %main_body
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX940-NEXT:    s_mov_b32 m0, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    global_load_lds_dword v[0:1], off offset:16 sc0
+; GFX940-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_vaddr:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX10-NEXT:    s_mov_b32 m0, s0
+; GFX10-NEXT:    global_load_dword v[0:1], off offset:16 glc lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_dword_vaddr:
+; GFX900-GISEL:       ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT:    v_readfirstlane_b32 m0, v2
+; GFX900-GISEL-NEXT:    s_nop 4
+; GFX900-GISEL-NEXT:    global_load_dword v[0:1], off offset:16 glc lds
+; GFX900-GISEL-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 4, i32 16, i32 1)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_saddr(i8 addrspace(1)* nocapture inreg %gptr, i8 addrspace(3)* nocapture %lptr) {
+; GFX900-LABEL: global_load_lds_dword_saddr:
+; GFX900:       ; %bb.0: ; %main_body
+; GFX900-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX900-NEXT:    v_mov_b32_e32 v1, 0
+; GFX900-NEXT:    s_mov_b32 m0, s2
+; GFX900-NEXT:    s_nop 0
+; GFX900-NEXT:    global_load_dword v1, s[0:1] offset:32 slc lds
+; GFX900-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_dword_saddr:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v1, s[0:1] offset:32 slc lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX940-LABEL: global_load_lds_dword_saddr:
+; GFX940:       ; %bb.0: ; %main_body
+; GFX940-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NEXT:    s_mov_b32 m0, s2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    global_load_lds_dword v1, s[0:1] offset:32 nt
+; GFX940-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_saddr:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v0, s[0:1] offset:32 slc lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_dword_saddr:
+; GFX900-GISEL:       ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX900-GISEL-NEXT:    s_nop 3
+; GFX900-GISEL-NEXT:    global_load_dword v0, s[0:1] offset:32 slc lds
+; GFX900-GISEL-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 4, i32 32, i32 2)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_dword_saddr_and_vaddr(i8 addrspace(1)* nocapture inreg %gptr, i8 addrspace(3)* nocapture %lptr, i32 %voffset) {
+; GFX900-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX900:       ; %bb.0: ; %main_body
+; GFX900-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX900-NEXT:    s_mov_b32 m0, s2
+; GFX900-NEXT:    s_nop 0
+; GFX900-NEXT:    global_load_dword v1, s[0:1] offset:48 lds
+; GFX900-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX90A-NEXT:    s_mov_b32 m0, s2
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_dword v1, s[0:1] offset:48 scc lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX940-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX940:       ; %bb.0: ; %main_body
+; GFX940-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX940-NEXT:    s_mov_b32 m0, s2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    global_load_lds_dword v1, s[0:1] offset:48 sc1
+; GFX940-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10-NEXT:    s_mov_b32 m0, s2
+; GFX10-NEXT:    global_load_dword v1, s[0:1] offset:48 lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX900-GISEL:       ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT:    v_readfirstlane_b32 m0, v0
+; GFX900-GISEL-NEXT:    s_nop 4
+; GFX900-GISEL-NEXT:    global_load_dword v1, s[0:1] offset:48 lds
+; GFX900-GISEL-NEXT:    s_endpgm
+main_body:
+  %voffset.64 = zext i32 %voffset to i64
+  %gep = getelementptr i8, i8 addrspace(1)* %gptr, i64 %voffset.64
+  call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gep, i8 addrspace(3)* %lptr, i32 4, i32 48, i32 16)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_ushort_vaddr(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr) {
+; GFX900-LABEL: global_load_lds_ushort_vaddr:
+; GFX900:       ; %bb.0: ; %main_body
+; GFX900-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX900-NEXT:    s_mov_b32 m0, s0
+; GFX900-NEXT:    s_nop 0
+; GFX900-NEXT:    global_load_ushort v[0:1], off lds
+; GFX900-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_ushort_vaddr:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX90A-NEXT:    s_mov_b32 m0, s0
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_ushort v[0:1], off lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX940-LABEL: global_load_lds_ushort_vaddr:
+; GFX940:       ; %bb.0: ; %main_body
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX940-NEXT:    s_mov_b32 m0, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    global_load_lds_ushort v[0:1], off
+; GFX940-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_ushort_vaddr:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX10-NEXT:    s_mov_b32 m0, s0
+; GFX10-NEXT:    global_load_ushort v[0:1], off dlc lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_ushort_vaddr:
+; GFX900-GISEL:       ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT:    v_readfirstlane_b32 m0, v2
+; GFX900-GISEL-NEXT:    s_nop 4
+; GFX900-GISEL-NEXT:    global_load_ushort v[0:1], off lds
+; GFX900-GISEL-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 2, i32 0, i32 4)
+  ret void
+}
+
+define amdgpu_ps void @global_load_lds_ubyte_vaddr(i8 addrspace(1)* nocapture %gptr, i8 addrspace(3)* nocapture %lptr) {
+; GFX900-LABEL: global_load_lds_ubyte_vaddr:
+; GFX900:       ; %bb.0: ; %main_body
+; GFX900-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX900-NEXT:    s_mov_b32 m0, s0
+; GFX900-NEXT:    s_nop 0
+; GFX900-NEXT:    global_load_ubyte v[0:1], off lds
+; GFX900-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: global_load_lds_ubyte_vaddr:
+; GFX90A:       ; %bb.0: ; %main_body
+; GFX90A-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX90A-NEXT:    s_mov_b32 m0, s0
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    global_load_ubyte v[0:1], off lds
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX940-LABEL: global_load_lds_ubyte_vaddr:
+; GFX940:       ; %bb.0: ; %main_body
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX940-NEXT:    s_mov_b32 m0, s0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    global_load_lds_ubyte v[0:1], off
+; GFX940-NEXT:    s_endpgm
+;
+; GFX10-LABEL: global_load_lds_ubyte_vaddr:
+; GFX10:       ; %bb.0: ; %main_body
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX10-NEXT:    s_mov_b32 m0, s0
+; GFX10-NEXT:    global_load_ubyte v[0:1], off lds
+; GFX10-NEXT:    s_endpgm
+;
+; GFX900-GISEL-LABEL: global_load_lds_ubyte_vaddr:
+; GFX900-GISEL:       ; %bb.0: ; %main_body
+; GFX900-GISEL-NEXT:    v_readfirstlane_b32 m0, v2
+; GFX900-GISEL-NEXT:    s_nop 4
+; GFX900-GISEL-NEXT:    global_load_ubyte v[0:1], off lds
+; GFX900-GISEL-NEXT:    s_endpgm
+main_body:
+  call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 1, i32 0, i32 0)
+  ret void
+}
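+
+; Illustrative usage sketch (not one of the checked tests above; the function
+; name is hypothetical): the intrinsic's operands are the global source
+; pointer, the LDS destination pointer, the byte size (1/2/4), an immediate
+; offset applied to both addresses, and the aux cache-policy bits described in
+; the intrinsic definition (bit 0 = glc/sc0, bit 1 = slc/sc1, bit 2 = dlc on
+; gfx10+, bit 4 = scc/nt on gfx90a+). For example, copying one dword per lane
+; with no extra offset and the default cache policy would look like:
+;
+;   define amdgpu_ps void @global_to_lds_copy(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr) {
+;     call void @llvm.amdgcn.global.load.lds(i8 addrspace(1)* %gptr, i8 addrspace(3)* %lptr, i32 4, i32 0, i32 0)
+;     ret void
+;   }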