Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -102,6 +102,7 @@ bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const; bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; + bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const; int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const; @@ -164,9 +165,11 @@ InstructionSelector::ComplexRendererFns selectMUBUFScratchOffset(MachineOperand &Root) const; - bool isDSOffsetLegal(const MachineRegisterInfo &MRI, - const MachineOperand &Base, - int64_t Offset, unsigned OffsetBits) const; + bool isDSOffsetLegal(Register Base, int64_t Offset, + unsigned OffsetBits) const; + + std::pair<Register, unsigned> + selectDS1Addr1OffsetImpl(MachineOperand &Src) const; InstructionSelector::ComplexRendererFns selectDS1Addr1Offset(MachineOperand &Root) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1197,6 +1197,36 @@ return true; } +bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI, + bool IsAppend) const { + Register PtrBase = MI.getOperand(2).getReg(); + LLT PtrTy = MRI->getType(PtrBase); + bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS; + + unsigned Offset; + std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2)); + + // TODO: Should this try to look through readfirstlane like GWS? 
+ if (!isDSOffsetLegal(PtrBase, Offset, 16)) { + PtrBase = MI.getOperand(2).getReg(); + Offset = 0; + } + + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME; + + BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .addReg(PtrBase); + BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg()) + .addImm(Offset) + .addImm(IsGDS ? -1 : 0) + .cloneMemRefs(MI); + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); @@ -1230,6 +1260,10 @@ case Intrinsic::amdgcn_ds_gws_sema_p: case Intrinsic::amdgcn_ds_gws_sema_release_all: return selectDSGWSIntrinsic(I, IntrinsicID); + case Intrinsic::amdgcn_ds_append: + return selectDSAppendConsume(I, true); + case Intrinsic::amdgcn_ds_consume: + return selectDSAppendConsume(I, false); default: return selectImpl(I, *CoverageInfo); } @@ -2248,8 +2282,7 @@ }}}; } -bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI, - const MachineOperand &Base, +bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base, int64_t Offset, unsigned OffsetBits) const { if ((OffsetBits == 16 && !isUInt<16>(Offset)) || @@ -2261,7 +2294,7 @@ // On Southern Islands instruction with a negative base value and an offset // don't seem to work. 
- return KnownBits->signBitIsZero(Base.getReg()); + return KnownBits->signBitIsZero(Base); } InstructionSelector::ComplexRendererFns @@ -2292,15 +2325,11 @@ }}; } -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const { const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg()); - if (!RootDef) { - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } - }}; - } + if (!RootDef) + return std::make_pair(Root.getReg(), 0); int64_t ConstAddr = 0; if (isBaseWithConstantOffset(Root, *MRI)) { @@ -2311,26 +2340,32 @@ if (LHSDef && RHSDef) { int64_t PossibleOffset = RHSDef->getOperand(1).getCImm()->getSExtValue(); - if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) { + if (isDSOffsetLegal(LHS.getReg(), PossibleOffset, 16)) { // (add n0, c0) - return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); } - }}; + return std::make_pair(LHS.getReg(), PossibleOffset); } } } else if (RootDef->getOpcode() == AMDGPU::G_SUB) { - + // TODO } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) { - + // TODO } + return std::make_pair(Root.getReg(), 0); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const { + + Register Reg; + unsigned Offset; + std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root); return {{ - [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } + [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } }}; } Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ 
llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2147,6 +2147,11 @@ constrainOpWithReadfirstlane(MI, MRI, 1); // M0 return; } + case Intrinsic::amdgcn_ds_append: + case Intrinsic::amdgcn_ds_consume: { + constrainOpWithReadfirstlane(MI, MRI, 2); // M0 + return; + } case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? @@ -3080,8 +3085,6 @@ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } - case Intrinsic::amdgcn_ds_append: - case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: @@ -3098,6 +3101,13 @@ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); break; } + case Intrinsic::amdgcn_ds_append: + case Intrinsic::amdgcn_ds_consume: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); + break; + } case Intrinsic::amdgcn_exp_compr: OpdsMapping[0] = nullptr; // IntrinsicID // FIXME: These are immediate values which can't be read from registers. 
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.append.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.append.ll @@ -0,0 +1,4 @@ +; XUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,SI,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll +; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-GISEL,GCN-GISEL %S/../llvm.amdgcn.ds.append.ll Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.consume.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.consume.ll @@ -0,0 +1,4 @@ +; XUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %S/../llvm.amdgcn.ds.append.ll +; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %S/../llvm.amdgcn.ds.append.ll +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %S/../llvm.amdgcn.ds.append.ll +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %S/../llvm.amdgcn.ds.append.ll | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 
%S/../llvm.amdgcn.ds.append.ll Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9,GCN-SDAG %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9,CIPLUS-SDAG,GCN-SDAG %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9,CIPLUS-SDAG,GCN-SDAG %s ; GCN-LABEL: {{^}}ds_append_lds: ; GCN: s_load_dword [[PTR:s[0-9]+]] @@ -51,10 +51,13 @@ ; GCN-LABEL: {{^}}ds_append_lds_over_max_offset: ; GCN: s_load_dword [[PTR:s[0-9]+]] -; SI: s_bitset1_b32 [[PTR]], 16 -; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000 +; SI-SDAG: s_bitset1_b32 [[PTR]], 16 +; CIPLUS-SDAG: s_add_i32 [[PTR]], [[PTR]], 0x10000 +; GCN-SDAG: s_mov_b32 m0, [[PTR]] + +; SI-GISEL: s_bitset1_b32 m0, 16 +; CIPLUS-GISEL: s_add_u32 m0, [[PTR]], 0x10000 -; GCN: s_mov_b32 m0, [[PTR]] ; GCN: ds_append [[RESULT:v[0-9]+]]{{$}} ; GCN-NOT: buffer_wbinvl1 ; GCN: {{.*}}store{{.*}} [[RESULT]] @@ -66,8 +69,11 @@ } ; GCN-LABEL: {{^}}ds_append_lds_vgpr_addr: -; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 -; GCN: s_mov_b32 
m0, [[READLANE]] +; GCN-SDAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0 +; GCN-SDAG: s_mov_b32 m0, [[READLANE]] + +; GCN-GISEL: v_readfirstlane_b32 m0, v0 + ; GCN: ds_append [[RESULT:v[0-9]+]]{{$}} ; GCN-NOT: buffer_wbinvl1 ; GCN: {{.*}}store{{.*}} [[RESULT]] @@ -127,8 +133,8 @@ ret void } -declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1) #1 -declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1) #1 +declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1 +declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly convergent nounwind } Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll @@ -127,8 +127,8 @@ ret void } -declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1) #1 -declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1) #1 +declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1 immarg) #1 +declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1 immarg) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly convergent nounwind }