Add the llvm.amdgcn.s.waitcnt.all intrinsic for AMDGPU.

The intrinsic is selected to S_WAITCNT, which is now marked mayLoad,
mayStore and hasSideEffects. SIInsertWaits calls
insertWait(MBB, I, LastIssued) at every S_WAITCNT it visits, records the
instruction in RemoveMI, and erases the recorded instructions after the
walk over the function has finished.

Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -68,6 +68,8 @@
 def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
   Intrinsic<[], [], [IntrConvergent]>;
 
+def int_amdgcn_s_waitcnt_all : Intrinsic<[], [], [IntrConvergent]>;
+
 def int_amdgcn_div_scale : Intrinsic<
   // 1st parameter: Numerator
   // 2nd parameter: Denominator
Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -552,6 +552,8 @@
   memset(&UsedRegs, 0, sizeof(UsedRegs));
   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
 
+  SmallVector<MachineInstr *, 4> RemoveMI;
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
 
@@ -610,7 +612,11 @@
       // Wait for everything before a barrier.
       if (I->getOpcode() == AMDGPU::S_BARRIER)
         Changes |= insertWait(MBB, I, LastIssued);
-      else
+      else if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+        RemoveMI.push_back(I);
+        insertWait(MBB, I, LastIssued);
+        Changes = true;
+      } else
         Changes |= insertWait(MBB, I, handleOperands(*I));
 
       pushInstruction(MBB, I);
@@ -621,5 +627,8 @@
     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
   }
 
+  for (MachineInstr *I : RemoveMI)
+    I->eraseFromParent();
+
   return Changes;
 }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -503,6 +503,7 @@
   let isConvergent = 1;
 }
 
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
 def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
 def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
 
@@ -2460,6 +2461,11 @@
 // SOPP Patterns
 //===----------------------------------------------------------------------===//
 
+def : Pat <
+  (int_amdgcn_s_waitcnt_all),
+  (S_WAITCNT)
+>;
+
 // FIXME: These should be removed eventually
 def : Pat <
   (int_AMDGPU_barrier_global),
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: image_store
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}}
+; CHECK-NEXT: image_store
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) {
+  call void @llvm.amdgcn.image.store.i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+  call void @llvm.amdgcn.s.waitcnt.all()
+  call void @llvm.amdgcn.image.store.i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.s.waitcnt.all() #0
+
+declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+attributes #0 = { nounwind }