Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1348,6 +1348,28 @@
   [IntrNoMem, IntrSpeculatable]
 >;
 
+// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
+//
+// bar_val is the total number of waves that will wait on this
+// barrier, minus 1.
+def int_amdgcn_ds_gws_init :
+  GCCBuiltin<"__builtin_amdgcn_ds_gws_init">,
+  Intrinsic<[],
+  [llvm_i32_ty, llvm_i32_ty],
+  [IntrConvergent, IntrWriteMem, IntrInaccessibleMemOnly], "",
+  [SDNPMemOperand]
+>;
+
+// llvm.amdgcn.ds.gws.barrier(i32 bar_val, i32 resource_id)
+// bar_val is the total number of waves that will wait on this
+// barrier, minus 1.
+def int_amdgcn_ds_gws_barrier :
+  GCCBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
+  Intrinsic<[],
+  [llvm_i32_ty, llvm_i32_ty],
+  [IntrConvergent, IntrInaccessibleMemOnly], "",
+  [SDNPMemOperand]
+>;
 
 // Copies the source value to the destination value, with the guarantee that
 // the source value is computed as if the entire program were executed in WQM.
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -218,7 +218,9 @@
   void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
   void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+  void SelectDS_GWS(SDNode *N, unsigned IntrID);
   void SelectINTRINSIC_W_CHAIN(SDNode *N);
+  void SelectINTRINSIC_VOID(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -832,6 +834,10 @@
     SelectINTRINSIC_W_CHAIN(N);
     return;
   }
+  case ISD::INTRINSIC_VOID: {
+    SelectINTRINSIC_VOID(N);
+    return;
+  }
   }
 
   SelectCode(N);
@@ -2024,6 +2030,73 @@
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
 }
 
+void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
+  SDLoc SL(N);
+  SDValue VSrc0 = N->getOperand(2);
+  SDValue BaseOffset = N->getOperand(3);
+  int ImmOffset = 0;
+  SDNode *CopyToM0;
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  MachineMemOperand *MMO = M->getMemOperand();
+
+  // Don't worry if the offset ends up in a VGPR. Only one lane will have
+  // effect, so SIFixSGPRCopies will validly insert readfirstlane.
+
+  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+  // offset field) % 64. Some versions of the programming guide omit the m0
+  // part, or claim it's from offset 0.
+  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
+    // If we have a constant offset, try to use the default value for m0 as a
+    // base to possibly avoid setting it up.
+    CopyToM0 = glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
+    ImmOffset = ConstOffset->getZExtValue() + 1;
+  } else {
+    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
+      ImmOffset = BaseOffset.getConstantOperandVal(1);
+      BaseOffset = BaseOffset.getOperand(0);
+    }
+
+    // Prefer to do the shift in an SGPR since it should be possible to use m0
+    // as the result directly. If it's already an SGPR, it will be eliminated
+    // later.
+    SDNode *SGPROffset
+      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
+                               BaseOffset);
+    // Shift to offset in m0
+    SDNode *M0Base
+      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+                               SDValue(SGPROffset, 0),
+                               CurDAG->getTargetConstant(16, SL, MVT::i32));
+    CopyToM0 = glueCopyToM0(N, SDValue(M0Base, 0));
+  }
+
+  // The manual doesn't mention this, but it seems only v0 works.
+  SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
+
+  SDValue CopyToV0 = CurDAG->getCopyToReg(
+    SDValue(CopyToM0, 0), SL, V0, VSrc0,
+    N->getOperand(N->getNumOperands() - 1));
+
+  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
+
+  // TODO: Can this just be removed from the instruction?
+  SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
+
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_gws_init ?
+    AMDGPU::DS_GWS_INIT : AMDGPU::DS_GWS_BARRIER;
+
+  SDValue Ops[] = {
+    V0,
+    OffsetField,
+    GDS,
+    CopyToV0, // Chain
+    CopyToV0.getValue(1) // Glue
+  };
+
+  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
   switch (IntrID) {
@@ -2034,6 +2107,18 @@
     SelectDSAppendConsume(N, IntrID);
     return;
   }
+  }
+
+  SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  switch (IntrID) {
+  case Intrinsic::amdgcn_ds_gws_init:
+  case Intrinsic::amdgcn_ds_gws_barrier:
+    SelectDS_GWS(N, IntrID);
+    return;
   default:
     break;
   }
Index: lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- lib/Target/AMDGPU/DSInstructions.td
+++ lib/Target/AMDGPU/DSInstructions.td
@@ -467,11 +467,15 @@
 defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
 defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
 
-def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
+let isConvergent = 1 in {
+def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
+  let mayLoad = 0;
+}
 def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
 def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
 def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
 def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
+}
 
 def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
 def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -961,6 +961,24 @@
     return true;
   }
+  case Intrinsic::amdgcn_ds_gws_init:
+  case Intrinsic::amdgcn_ds_gws_barrier: {
+    Info.opc = ISD::INTRINSIC_VOID;
+
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    Info.ptrVal =
+        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+
+    // This is an abstract access, but we need to specify a type and size.
+    Info.memVT = MVT::i32;
+    Info.size = 4;
+    Info.align = 4;
+
+    Info.flags = MachineMemOperand::MOStore;
+    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
+      Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
   default:
     return false;
   }
 }
Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -536,15 +536,19 @@
     // Put score on the source vgprs. If this is a store, just use those
     // specific register(s).
     if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+      int AddrOpIdx =
+          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
       // All GDS operations must protect their address register (same as
      // export.)
-      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
-          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
-        setExpScore(
-            &Inst, TII, TRI, MRI,
-            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
-            CurrScore);
+      if (AddrOpIdx != -1) {
+        setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
+      } else {
+        assert(Inst.getOpcode() == AMDGPU::DS_APPEND ||
+               Inst.getOpcode() == AMDGPU::DS_CONSUME ||
+               Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
+               Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER);
       }
+
       if (Inst.mayStore()) {
         if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                        AMDGPU::OpName::data0) != -1) {
@@ -1407,18 +1411,6 @@
     ScoreBrackets.dump();
   });
 
-  // Check to see if this is a GWS instruction. If so, and if this is CI or
-  // VI, then the generated code sequence will include an S_WAITCNT 0.
-  // TODO: Are these the only GWS instructions?
-  if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
-      Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
-      Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
-      Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
-      Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
-    // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
-    ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
-  }
-
   // TODO: Remove this work-around after fixing the scheduler and enable the
   // assert above.
   if (VCCZBugWorkAround) {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2491,7 +2491,8 @@
   // given the typical code patterns.
   if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
       Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
-      Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP)
+      Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
+      Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
     return true;
 
   if (MI.isCall() || MI.isInlineAsm())
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -43,7 +43,8 @@
 public:
   enum AMDGPUPSVKind : unsigned {
     PSVBuffer = PseudoSourceValue::TargetCustom,
-    PSVImage
+    PSVImage,
+    GWSResource
   };
 
 protected:
@@ -87,6 +88,30 @@
   }
 };
 
+class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
+public:
+  explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
+      : AMDGPUPseudoSourceValue(GWSResource, TII) {}
+
+  static bool classof(const PseudoSourceValue *V) {
+    return V->kind() == GWSResource;
+  }
+
+  // These are inaccessible memory from IR.
+  bool isAliased(const MachineFrameInfo *) const override {
+    return false;
+  }
+
+  // These are inaccessible memory from IR.
+  bool mayAlias(const MachineFrameInfo *) const override {
+    return false;
+  }
+
+  void printCustom(raw_ostream &OS) const override {
+    OS << "GWSResource";
+  }
+};
+
 namespace yaml {
 
 struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
@@ -188,6 +213,7 @@
            std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
   DenseMap<const Value *,
            std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
+  std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
 
 private:
   unsigned LDSWaveSpillSize = 0;
@@ -674,6 +700,15 @@
     return PSV.first->second.get();
   }
 
+  const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
+    if (!GWSResourcePSV) {
+      GWSResourcePSV =
+          llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
+    }
+
+    return GWSResourcePSV.get();
+  }
+
   unsigned getOccupancy() const {
     return Occupancy;
   }
Index: test/CodeGen/AMDGPU/gws-hazards.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/gws-hazards.mir
@@ -0,0 +1,103 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=VI %s
+# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=CI %s
+# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=SI %s
+
+---
+name: m0_gws_init0
+tracksRegLiveness: true
+body: |
+
+  bb.0:
+    liveins: $vgpr0
+    ; GFX9-LABEL: name: m0_gws_init0
+    ; GFX9: liveins: $vgpr0
+    ; GFX9: $m0 = S_MOV_B32 -1
+    ; GFX9: S_NOP 0
+    ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    ; VI-LABEL: name: m0_gws_init0
+    ; VI: liveins: $vgpr0
+    ; VI: $m0 = S_MOV_B32 -1
+    ; VI: S_NOP 0
+    ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    ; CI-LABEL: name: m0_gws_init0
+    ; CI: liveins: $vgpr0
+    ; CI: $m0 = S_MOV_B32 -1
+    ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    ; SI-LABEL: name: m0_gws_init0
+    ; SI: liveins: $vgpr0
+    ; SI: $m0 = S_MOV_B32 -1
+    ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+...
+
+---
+name: m0_gws_init1
+tracksRegLiveness: true
+body: |
+
+  bb.0:
+    ; GFX9-LABEL: name: m0_gws_init1
+    ; GFX9: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: $m0 = S_MOV_B32 -1
+    ; GFX9: S_NOP 0
+    ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    ; VI-LABEL: name: m0_gws_init1
+    ; VI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; VI: $m0 = S_MOV_B32 -1
+    ; VI: S_NOP 0
+    ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    ; CI-LABEL: name: m0_gws_init1
+    ; CI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; CI: $m0 = S_MOV_B32 -1
+    ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    ; SI-LABEL: name: m0_gws_init1
+    ; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    ; SI: $m0 = S_MOV_B32 -1
+    ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    $m0 = S_MOV_B32 -1
+    DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+...
+
+# Test a typical situation where m0 needs to be set from a VGPR
+# through readfirstlane
+---
+name: m0_gws_readlane
+tracksRegLiveness: true
+body: |
+
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; GFX9-LABEL: name: m0_gws_readlane
+    ; GFX9: liveins: $vgpr0, $vgpr1
+    ; GFX9: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    ; GFX9: $m0 = S_MOV_B32 $sgpr0
+    ; GFX9: S_NOP 0
+    ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    ; VI-LABEL: name: m0_gws_readlane
+    ; VI: liveins: $vgpr0, $vgpr1
+    ; VI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    ; VI: $m0 = S_MOV_B32 $sgpr0
+    ; VI: S_NOP 0
+    ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    ; CI-LABEL: name: m0_gws_readlane
+    ; CI: liveins: $vgpr0, $vgpr1
+    ; CI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    ; CI: $m0 = S_MOV_B32 $sgpr0
+    ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    ; SI-LABEL: name: m0_gws_readlane
+    ; SI: liveins: $vgpr0, $vgpr1
+    ; SI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    ; SI: $m0 = S_MOV_B32 $sgpr0
+    ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+    $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+    $m0 = S_MOV_B32 $sgpr0
+    DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+...
Index: test/CodeGen/AMDGPU/insert-skips-gws.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/insert-skips-gws.mir
@@ -0,0 +1,59 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-skips -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
+# Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0
+
+---
+
+name: skip_gws_init
+body: |
+  ; CHECK-LABEL: name: skip_gws_init
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK:   SI_MASK_BRANCH %bb.2, implicit $exec
+  ; CHECK:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+  ; CHECK: bb.2:
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    SI_MASK_BRANCH %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+
+name: skip_gws_barrier
+body: |
+  ; CHECK-LABEL: name: skip_gws_barrier
+  ; CHECK: bb.0:
+  ; CHECK:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK:   SI_MASK_BRANCH %bb.2, implicit $exec
+  ; CHECK:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK: bb.1:
+  ; CHECK:   successors: %bb.2(0x80000000)
+  ; CHECK:   $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK:   DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
+  ; CHECK: bb.2:
+  ; CHECK:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    SI_MASK_BRANCH %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+    DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+  bb.2:
+    S_ENDPGM 0
+...
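The .ll tests in the next two files drive the new intrinsics directly from IR. As a rough usage sketch (not part of the patch; the kernel name below is made up), a call with a constant resource id lets the SelectDS_GWS code above keep m0 at its default of -1 and fold the id into the immediate offset field as id + 1:

declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #0

define amdgpu_kernel void @barrier_example(i32 %val) {
  ; Constant resource id 3 is expected to select to roughly:
  ;   s_mov_b32 m0, -1
  ;   v_mov_b32_e32 v0, <bar_val>
  ;   ds_gws_barrier v0 offset:4 gds
  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 3)
  ret void
}

attributes #0 = { convergent inaccessiblememonly nounwind }

A non-constant resource id instead goes through v_readfirstlane_b32 and an s_lshl_b32 by 16 into m0, as the selection code above arranges.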
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
@@ -0,0 +1,179 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+
+; Minimum offset
+; GCN-LABEL: {{^}}gws_barrier_offset0:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: s_mov_b32 m0, -1{{$}}
+; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
+define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
+  ret void
+}
+
+; Maximum offset
+; GCN-LABEL: {{^}}gws_barrier_offset63:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: s_mov_b32 m0, -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_barrier v0 offset:64 gds{{$}}
+define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
+  ret void
+}
+
+; FIXME: Should be able to shift directly into m0
+; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; GCN: ds_gws_barrier v0 gds{{$}}
+define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
+  ret void
+}
+
+; Variable offset in SGPR with constant add
+; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
+define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
+  %offset = add i32 %offset.base, 1
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
+  ret void
+}
+
+; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_barrier v0 gds{{$}}
+define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
+  %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
+  ret void
+}
+
+; Variable offset in VGPR with constant add
+; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_barrier v0 offset:3 gds{{$}}
+define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
+  %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
+  %vgpr.offset = add i32 %vgpr.offset.base, 3
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
+  ret void
+}
+
+@lds = internal unnamed_addr addrspace(3) global i32 undef
+
+; Check if m0 initialization is shared
+; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
+; GCN: s_mov_b32 m0, -1
+; GCN-NOT: s_mov_b32 m0
+define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
+  store i32 1, i32 addrspace(3)* @lds
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
+  store i32 2, i32 addrspace(3)* @lds
+  ret void
+}
+
+; Make sure this increments lgkmcnt
+; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
+; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @gws_barrier_lgkmcnt(i32 %val) {
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
+  ret void
+}
+
+; Does not imply memory fence on its own
+; GCN-LABEL: {{^}}gws_barrier_wait_before:
+; GCN: store_dword
+; CIPLUS-NOT: s_waitcnt
+; GCN: ds_gws_barrier v0 offset:8 gds
+define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
+  store i32 0, i32 addrspace(1)* %ptr
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+  ret void
+}
+
+; GCN-LABEL: {{^}}gws_barrier_wait_after:
+; GCN: ds_gws_barrier v0 offset:8 gds
+; GCN-NEXT: s_waitcnt expcnt(0){{$}}
+; GCN-NEXT: load_dword
+define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+  %load = load volatile i32, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; Does not imply memory fence on its own
+; GCN-LABEL: {{^}}gws_barrier_fence_before:
+; GCN: store_dword
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN: ds_gws_barrier v0 offset:8 gds
+define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
+  store i32 0, i32 addrspace(1)* %ptr
+  fence release
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+  ret void
+}
+
+; GCN-LABEL: {{^}}gws_barrier_fence_after:
+; GCN: ds_gws_barrier v0 offset:8 gds
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: load_dword
+define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+  fence release
+  %load = load volatile i32, i32 addrspace(1)* %ptr
+  ret void
+}
+
+; FIXME: Should a wait be inserted here, or is an explicit fence needed?
+; GCN-LABEL: {{^}}gws_init_barrier:
+; GCN: s_mov_b32 m0, -1
+; GCN: ds_gws_init v0 offset:8 gds
+; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
+define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+  ret void
+}
+
+; FIXME: Why vmcnt, not expcnt?
+; GCN-LABEL: {{^}}gws_init_fence_barrier:
+; GCN: s_mov_b32 m0, -1
+; GCN: ds_gws_init v0 offset:8 gds
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
+define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
+  fence release
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+  ret void
+}
+
+declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
+declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
+declare i32 @llvm.amdgcn.workitem.id.x() #3
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent inaccessiblememonly nounwind }
+attributes #2 = { convergent inaccessiblememonly nounwind writeonly }
+attributes #3 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
@@ -0,0 +1,119 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+
+; Minimum offset
+; GCN-LABEL: {{^}}gws_init_offset0:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: s_mov_b32 m0, -1{{$}}
+; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_init v0 offset:1 gds{{$}}
define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
+  ret void
+}
+
+; Maximum offset
+; GCN-LABEL: {{^}}gws_init_offset63:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: s_mov_b32 m0, -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_init v0 offset:64 gds{{$}}
+define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
+  ret void
+}
+
+; FIXME: Should be able to shift directly into m0
+; GCN-LABEL: {{^}}gws_init_sgpr_offset:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; GCN: ds_gws_init v0 gds{{$}}
+define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
+  ret void
+}
+
+; Variable offset in SGPR with constant add
+; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; GCN: ds_gws_init v0 offset:1 gds{{$}}
+define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
+  %offset = add i32 %offset.base, 1
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
+  ret void
+}
+
+; GCN-LABEL: {{^}}gws_init_vgpr_offset:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_init v0 gds{{$}}
+define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
+  %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
+  ret void
+}
+
+; Variable offset in VGPR with constant add
+; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_init v0 offset:3 gds{{$}}
+define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
+  %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
+  %vgpr.offset = add i32 %vgpr.offset.base, 3
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
+  ret void
+}
+
+@lds = internal unnamed_addr addrspace(3) global i32 undef
+
+; Check if m0 initialization is shared.
+; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
+; GCN: s_mov_b32 m0, -1
+; GCN-NOT: s_mov_b32 m0
+define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
+  store i32 1, i32 addrspace(3)* @lds
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
+  store i32 2, i32 addrspace(3)* @lds
+  ret void
+}
+
+; GCN-LABEL: {{^}}gws_init_lgkmcnt:
+; GCN: ds_gws_init v0 offset:1 gds{{$}}
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @gws_init_lgkmcnt(i32 %val) {
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
+  ret void
+}
+
+; Does not imply memory fence on its own
+; GCN-LABEL: {{^}}gws_init_wait_before:
+; GCN: store_dword
+; CIPLUS-NOT: s_waitcnt
+; GCN: ds_gws_init v0 offset:8 gds
+define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
+  store i32 0, i32 addrspace(1)* %ptr
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
+  ret void
+}
+
+declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent inaccessiblememonly nounwind writeonly }
+attributes #2 = { nounwind readnone speculatable }
Index: test/CodeGen/AMDGPU/tail-duplication-convergent.ll
===================================================================
--- test/CodeGen/AMDGPU/tail-duplication-convergent.ll
+++ test/CodeGen/AMDGPU/tail-duplication-convergent.ll
@@ -6,6 +6,8 @@
 declare void @nonconvergent_func() #0
 declare void @convergent_func() #1
 declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
+declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2
 
 ; barrier shouldn't be duplicated.
@@ -100,6 +102,52 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}taildup_gws_init:
+; GCN: ds_gws_init
+; GCN-NOT: ds_gws_init
+define amdgpu_kernel void @taildup_gws_init(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
+entry:
+  br i1 %cond, label %bb1, label %bb2
+
+bb1:
+  store i32 0, i32 addrspace(1)* %a
+  br label %call
+
+bb2:
+  store i32 1, i32 addrspace(1)* %a
+  br label %call
+
+call:
+  call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
+  br label %ret
+
+ret:
+  ret void
+}
+
+; GCN-LABEL: {{^}}taildup_gws_barrier:
+; GCN: ds_gws_barrier
+; GCN-NOT: ds_gws_barrier
+define amdgpu_kernel void @taildup_gws_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
+entry:
+  br i1 %cond, label %bb1, label %bb2
+
+bb1:
+  store i32 0, i32 addrspace(1)* %a
+  br label %call
+
+bb2:
+  store i32 1, i32 addrspace(1)* %a
+  br label %call
+
+call:
+  call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
+  br label %ret
+
+ret:
+  ret void
+}
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind convergent }
+attributes #2 = { convergent inaccessiblememonly nounwind }
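As a closing illustration (not part of the patch), here is a rough IR sketch of how the two intrinsics are meant to pair up, modeled on the @gws_init_barrier test above. The kernel name and wave-count argument are hypothetical, and in practice only a single wave of the group would normally execute the init:

declare void @llvm.amdgcn.ds.gws.init(i32, i32) #0
declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1

define amdgpu_kernel void @gws_sync_example(i32 %num_waves) {
  ; bar_val is the number of participating waves, minus 1.
  %bar_val = sub i32 %num_waves, 1
  ; Set up GWS resource 0 for %num_waves waves, then wait on it.
  call void @llvm.amdgcn.ds.gws.init(i32 %bar_val, i32 0)
  call void @llvm.amdgcn.ds.gws.barrier(i32 %bar_val, i32 0)
  ret void
}

attributes #0 = { convergent inaccessiblememonly nounwind writeonly }
attributes #1 = { convergent inaccessiblememonly nounwind }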