Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td @@ -68,6 +68,8 @@ def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">, Intrinsic<[], [], [IntrConvergent]>; +def int_amdgcn_s_waitcnt : Intrinsic<[], [llvm_i32_ty], []>; + def int_amdgcn_div_scale : Intrinsic< // 1st parameter: Numerator // 2nd parameter: Denominator Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -68,6 +68,10 @@ /// \brief Counter values we have already waited on. Counters WaitedOn; + /// \brief Counter values that we must wait on before the next counter + /// increase. + Counters DelayedWaitOn; + /// \brief Counter values for last instruction issued. Counters LastIssued; @@ -103,13 +107,17 @@ /// \brief Handle instructions async components void pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I); + MachineBasicBlock::iterator I, + const Counters& Increment); /// \brief Insert the actual wait instruction bool insertWait(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const Counters &Counts); + /// \brief Handle existing wait instructions (from intrinsics) + void handleExistingWait(MachineBasicBlock::iterator I); + /// \brief Do we need def2def checks? 
bool unorderedDefines(MachineInstr &MI); @@ -287,10 +295,10 @@ } void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator I, + const Counters &Increment) { // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(*I); Counters Limit = ZeroCounts; unsigned Sum = 0; @@ -430,16 +438,38 @@ Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); } +/// \brief check whether any of the counters is non-zero +static bool countersNonZero(const Counters &Counter) { + for (unsigned i = 0; i < 3; ++i) + if (Counter.Array[i]) + return true; + return false; +} + +void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { + assert(I->getOpcode() == AMDGPU::S_WAITCNT); + + unsigned Imm = I->getOperand(0).getImm(); + Counters Counts, WaitOn; + + Counts.Named.VM = Imm & 0xF; + Counts.Named.EXP = (Imm >> 4) & 0x7; + Counts.Named.LGKM = (Imm >> 8) & 0xF; + + for (unsigned i = 0; i < 3; ++i) { + if (Counts.Array[i] <= LastIssued.Array[i]) + WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + else + WaitOn.Array[i] = 0; + } + + increaseCounters(DelayedWaitOn, WaitOn); +} + Counters SIInsertWaits::handleOperands(MachineInstr &MI) { Counters Result = ZeroCounts; - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if (MI.getOpcode() == AMDGPU::S_SENDMSG) - return LastIssued; - // For each register affected by this instruction increase the result // sequence. 
// @@ -544,6 +574,7 @@ MRI = &MF.getRegInfo(); WaitedOn = ZeroCounts; + DelayedWaitOn = ZeroCounts; LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -552,6 +583,8 @@ memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + SmallVector<MachineInstr *, 4> RemoveMI; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { @@ -607,13 +640,34 @@ I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) TII->insertWaitStates(MBB, std::next(I), 4); + // Record pre-existing, explicitly requested waits + if (I->getOpcode() == AMDGPU::S_WAITCNT) { + handleExistingWait(*I); + RemoveMI.push_back(I); + continue; + } + + Counters Required; + // Wait for everything before a barrier. - if (I->getOpcode() == AMDGPU::S_BARRIER) - Changes |= insertWait(MBB, I, LastIssued); + // + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (I->getOpcode() == AMDGPU::S_BARRIER || + I->getOpcode() == AMDGPU::S_SENDMSG) + Required = LastIssued; else - Changes |= insertWait(MBB, I, handleOperands(*I)); + Required = handleOperands(*I); + + Counters Increment = getHwCounts(*I); - pushInstruction(MBB, I); + if (countersNonZero(Required) || countersNonZero(Increment)) + increaseCounters(Required, DelayedWaitOn); + + Changes |= insertWait(MBB, I, Required); + + pushInstruction(MBB, I, Increment); handleSendMsg(MBB, I); } @@ -621,5 +675,8 @@ Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); } + for (MachineInstr *I : RemoveMI) + I->eraseFromParent(); + return Changes; } Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td @@ -43,8 +43,9 @@ let ParserMethod = "parseSWaitCntOps"; } -def WAIT_FLAG : InstFlag<"printWaitFlag"> {
+def WAIT_FLAG : Operand <i16> { let ParserMatchClass = SWaitMatchClass; + let PrintMethod = "printWaitFlag"; } let SubtargetPredicate = isGCN in { @@ -506,6 +507,7 @@ let isConvergent = 1; } +let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">; @@ -2452,6 +2454,11 @@ // SOPP Patterns //===----------------------------------------------------------------------===// +def : Pat < + (int_amdgcn_s_waitcnt i32:$simm16), + (S_WAITCNT (as_i16imm $simm16)) +>; + // FIXME: These should be removed eventually def : Pat < (int_AMDGPU_barrier_global), Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s + +; CHECK-LABEL: {{^}}test1: +; CHECK: image_store +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}} +; CHECK-NEXT: image_store +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) { + call void @llvm.amdgcn.image.store.i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0) + call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00 + call void @llvm.amdgcn.image.store.i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0) + ret void +} + +; Test that the intrinsic is merged with automatically generated waits and +; emitted as late as possible.
+; +; CHECK-LABEL: {{^}}test2: +; CHECK: image_load +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: s_waitcnt +; CHECK-NEXT: image_store +define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) { + %t = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00 + %c.1 = mul i32 %c, 2 + call void @llvm.amdgcn.image.store.i32(<4 x float> %t, i32 %c.1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.s.waitcnt(i32) #0 + +declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly }