Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -1371,6 +1371,43 @@ [SDNPMemOperand] >; +// llvm.amdgcn.ds.gws.sema.v(i32 resource_id) +def int_amdgcn_ds_gws_sema_v : + GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, + Intrinsic<[], + [llvm_i32_ty], + [IntrConvergent, IntrInaccessibleMemOnly], "", + [SDNPMemOperand] +>; + +// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) +def int_amdgcn_ds_gws_sema_br : + GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, + Intrinsic<[], + [llvm_i32_ty, llvm_i32_ty], + [IntrConvergent, IntrInaccessibleMemOnly], "", + [SDNPMemOperand] +>; + +// llvm.amdgcn.ds.gws.sema.p(i32 resource_id) +def int_amdgcn_ds_gws_sema_p : + GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, + Intrinsic<[], + [llvm_i32_ty], + [IntrConvergent, IntrInaccessibleMemOnly], "", + [SDNPMemOperand] +>; + +// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) +def int_amdgcn_ds_gws_sema_release_all : + GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, + Intrinsic<[], + [llvm_i32_ty], + [IntrConvergent, IntrInaccessibleMemOnly], "", + [SDNPMemOperand] +>; + + // Copies the source value to the destination value, with the guarantee that // the source value is computed as if the entire program were executed in WQM. 
def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2040,10 +2040,39 @@ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); } +static unsigned gwsIntrinToOpcode(unsigned IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_ds_gws_init: + return AMDGPU::DS_GWS_INIT; + case Intrinsic::amdgcn_ds_gws_barrier: + return AMDGPU::DS_GWS_BARRIER; + case Intrinsic::amdgcn_ds_gws_sema_v: + return AMDGPU::DS_GWS_SEMA_V; + case Intrinsic::amdgcn_ds_gws_sema_br: + return AMDGPU::DS_GWS_SEMA_BR; + case Intrinsic::amdgcn_ds_gws_sema_p: + return AMDGPU::DS_GWS_SEMA_P; + case Intrinsic::amdgcn_ds_gws_sema_release_all: + return AMDGPU::DS_GWS_SEMA_RELEASE_ALL; + default: + llvm_unreachable("not a gws intrinsic"); + } +} + void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { + if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all && + !Subtarget->hasGWSSemaReleaseAll()) { + // Let this error. + SelectCode(N); + return; + } + + // Chain, intrinsic ID, vsrc, offset + const bool HasVSrc = N->getNumOperands() == 4; + assert(HasVSrc || N->getNumOperands() == 3); + SDLoc SL(N); - SDValue VSrc0 = N->getOperand(2); - SDValue BaseOffset = N->getOperand(3); + SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2); int ImmOffset = 0; MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); MachineMemOperand *MMO = M->getMemOperand(); @@ -2079,28 +2108,37 @@ glueCopyToM0(N, SDValue(M0Base, 0)); } - // The manual doesn't mention this, but it seems only v0 works. - SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32); + SDValue V0; + SDValue Chain = N->getOperand(0); + SDValue Glue; + if (HasVSrc) { + SDValue VSrc0 = N->getOperand(2); + + // The manual doesn't mention this, but it seems only v0 works. 
+ V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32); - SDValue CopyToV0 = CurDAG->getCopyToReg( - N->getOperand(0), SL, V0, VSrc0, - N->getOperand(N->getNumOperands() - 1)); + SDValue CopyToV0 = CurDAG->getCopyToReg( + N->getOperand(0), SL, V0, VSrc0, + N->getOperand(N->getNumOperands() - 1)); + Chain = CopyToV0; + Glue = CopyToV0.getValue(1); + } SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); // TODO: Can this just be removed from the instruction? SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1); - unsigned Opc = IntrID == Intrinsic::amdgcn_ds_gws_init ? - AMDGPU::DS_GWS_INIT : AMDGPU::DS_GWS_BARRIER; + const unsigned Opc = gwsIntrinToOpcode(IntrID); + SmallVector<SDValue, 5> Ops; + if (HasVSrc) + Ops.push_back(V0); + Ops.push_back(OffsetField); + Ops.push_back(GDS); + Ops.push_back(Chain); - SDValue Ops[] = { - V0, - OffsetField, - GDS, - CopyToV0, // Chain - CopyToV0.getValue(1) // Glue - }; + if (HasVSrc) + Ops.push_back(Glue); SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO}); @@ -2126,6 +2164,10 @@ switch (IntrID) { case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_br: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: SelectDS_GWS(N, IntrID); return; default: Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -571,6 +571,11 @@ return CIInsts && EnableDS128; } + /// \returns true if the target has the ds_gws_sema_release_all instruction. + bool hasGWSSemaReleaseAll() const { + return CIInsts; + } + /// \returns If MUBUF instructions always perform range checking, even for /// buffer resources used for private memory access. 
bool privateMemoryResourceIsRangeChecked() const { Index: lib/Target/AMDGPU/DSInstructions.td =================================================================== --- lib/Target/AMDGPU/DSInstructions.td +++ lib/Target/AMDGPU/DSInstructions.td @@ -557,7 +557,9 @@ defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>; defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>; +let isConvergent = 1, usesCustomInserter = 1 in { def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">; +} let mayStore = 0 in { defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -962,7 +962,11 @@ return true; } case Intrinsic::amdgcn_ds_gws_init: - case Intrinsic::amdgcn_ds_gws_barrier: { + case Intrinsic::amdgcn_ds_gws_barrier: + case Intrinsic::amdgcn_ds_gws_sema_v: + case Intrinsic::amdgcn_ds_gws_sema_br: + case Intrinsic::amdgcn_ds_gws_sema_p: + case Intrinsic::amdgcn_ds_gws_sema_release_all: { Info.opc = ISD::INTRINSIC_VOID; SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -2982,9 +2986,7 @@ std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true); MachineBasicBlock::iterator I = LoopBB->end(); - MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0); - assert(Src && "missing operand from GWS instruction"); const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg( AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); @@ -2996,7 +2998,7 @@ // This is a pain, but we're not allowed to have physical register live-ins // yet. Insert a pair of copies if the VGPR0 hack is necessary. 
- if (TargetRegisterInfo::isPhysicalRegister(Src->getReg())) { + if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) { unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0) .add(*Src); @@ -3723,6 +3725,7 @@ case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_BR: case AMDGPU::DS_GWS_SEMA_P: + case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: case AMDGPU::DS_GWS_BARRIER: if (getSubtarget()->hasGWSAutoReplay()) return BB; Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -542,11 +542,6 @@ // export.) if (AddrOpIdx != -1) { setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore); - } else { - assert(Inst.getOpcode() == AMDGPU::DS_APPEND || - Inst.getOpcode() == AMDGPU::DS_CONSUME || - Inst.getOpcode() == AMDGPU::DS_GWS_INIT || - Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER); } if (Inst.mayStore()) { Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.br.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.br.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s + +; GCN-LABEL: 
{{^}}gws_sema_br_offset0: +; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]] +; NOLOOP-DAG: s_mov_b32 m0, -1{{$}} +; NOLOOP: v_mov_b32_e32 v0, [[BAR_NUM]] +; NOLOOP: ds_gws_sema_br v0 offset:1 gds{{$}} + +; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]: +; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0 +; LOOP-NEXT: ds_gws_sema_br v0 offset:1 gds +; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1) +; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0 +; LOOP-NEXT: s_cbranch_scc1 [[LOOP]] +define amdgpu_kernel void @gws_sema_br_offset0(i32 %val) #0 { + call void @llvm.amdgcn.ds.gws.sema.br(i32 %val, i32 0) + ret void +} + +declare void @llvm.amdgcn.ds.gws.sema.br(i32, i32) #0 + +attributes #0 = { convergent inaccessiblememonly nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll @@ -0,0 +1,26 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP,GFX8 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s + +; GCN-LABEL: {{^}}gws_sema_p_offset0: +; NOLOOP-DAG: s_mov_b32 m0, -1{{$}} +; NOLOOP: ds_gws_sema_p offset:1 gds{{$}} + +; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]: +; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0 +; GFX8-NEXT: s_nop 0 +; 
LOOP-NEXT: ds_gws_sema_p offset:1 gds +; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1) +; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0 +; LOOP-NEXT: s_cbranch_scc1 [[LOOP]] +define amdgpu_kernel void @gws_sema_p_offset0(i32 %val) #0 { + call void @llvm.amdgcn.ds.gws.sema.p(i32 0) + ret void +} + +declare void @llvm.amdgcn.ds.gws.sema.p(i32) #0 + +attributes #0 = { convergent inaccessiblememonly nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll @@ -0,0 +1,28 @@ +; RUN: not llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - < %s 2>&1 | FileCheck -enable-var-scope -check-prefix=GFX6ERR %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP,GFX8 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s + +; GFX6ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.gws.sema.release.all + +; GCN-LABEL: {{^}}gws_sema_release_all_offset0: +; NOLOOP-DAG: s_mov_b32 m0, -1{{$}} +; NOLOOP: ds_gws_sema_release_all offset:1 gds{{$}} + +; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]: +; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0 +; GFX8-NEXT: s_nop 0 +; LOOP-NEXT: ds_gws_sema_release_all offset:1 gds +; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1) +; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0 +; 
LOOP-NEXT: s_cbranch_scc1 [[LOOP]] +define amdgpu_kernel void @gws_sema_release_all_offset0(i32 %val) #0 { + call void @llvm.amdgcn.ds.gws.sema.release.all(i32 0) + ret void +} + +declare void @llvm.amdgcn.ds.gws.sema.release.all(i32) #0 + +attributes #0 = { convergent inaccessiblememonly nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll @@ -0,0 +1,26 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP,GFX8 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s + +; GCN-LABEL: {{^}}gws_sema_v_offset0: +; NOLOOP-DAG: s_mov_b32 m0, -1{{$}} +; NOLOOP: ds_gws_sema_v offset:1 gds{{$}} + +; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]: +; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0 +; GFX8-NEXT: s_nop 0 +; LOOP-NEXT: ds_gws_sema_v offset:1 gds +; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1) +; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0 +; LOOP-NEXT: s_cbranch_scc1 [[LOOP]] +define amdgpu_kernel void @gws_sema_v_offset0(i32 %val) #0 { + call void @llvm.amdgcn.ds.gws.sema.v(i32 0) + ret void +} + +declare void @llvm.amdgcn.ds.gws.sema.v(i32) #0 + +attributes #0 = { convergent inaccessiblememonly nounwind } Index: 
test/CodeGen/AMDGPU/tail-duplication-convergent.ll =================================================================== --- test/CodeGen/AMDGPU/tail-duplication-convergent.ll +++ test/CodeGen/AMDGPU/tail-duplication-convergent.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O2 -tail-dup-size=1000 -tail-dup-placement-threshold=1000 -enable-tail-merge=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -O2 -tail-dup-size=1000 -tail-dup-placement-threshold=1000 -enable-tail-merge=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; Need to to trigger tail duplication this during ; MachineBlockPlacement, since calls aren't tail duplicated pre-RA. @@ -8,6 +8,7 @@ declare void @llvm.amdgcn.s.barrier() #1 declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2 declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2 +declare void @llvm.amdgcn.ds.gws.sema.release.all(i32 %offset) #2 ; barrier shouldn't be duplicated. @@ -148,6 +149,29 @@ ret void } +; GCN-LABEL: {{^}}taildup_gws_sema_release_all: +; GCN: ds_gws_sema_release_all +; GCN-NOT: ds_gws +define amdgpu_kernel void @taildup_gws_sema_release_all(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %offset) #0 { +entry: + br i1 %cond, label %bb1, label %bb2 + +bb1: + store i32 0, i32 addrspace(1)* %a + br label %call + +bb2: + store i32 1, i32 addrspace(1)* %a + br label %call + +call: + call void @llvm.amdgcn.ds.gws.sema.release.all(i32 %offset) + br label %ret + +ret: + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind convergent } attributes #2 = { convergent inaccessiblememonly nounwind }