Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -984,8 +984,8 @@ LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); } } - // End of for loop that looks at all source operands to decide vm_wait_cnt - // and lgk_wait_cnt. + // End of for loop that looks at all source operands to decide vm_cnt + // and lgkm_cnt. // Two cases are handled for destination operands: // 1) If the destination operand was defined by a load, add the s_waitcnt @@ -1263,12 +1263,23 @@ // they will need to retain and not clear their initial state. // See if there are any uninitialized predecessors. If so, emit an - // s_waitcnt 0 at the beginning of the block. - for (MachineBasicBlock *pred : Block.predecessors()) { + // s_waitcnt 0 at the beginning of the block if we're sure + // that we will *not* revisit the block. + for (MachineBasicBlock *Pred : Block.predecessors()) { BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[pred].get(); - bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end(); - if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { + BlockWaitcntBracketsMap[Pred].get(); + bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end(); + if (!Visited && !ScoreBrackets->getRevisitLoop()){ + // pred not visited, so there better be no PredScoreBracket. + // Unfortunately, emit a s_waitcnt 0 since we don't have + // pred info and we're not going to get it. + // FIXME: can we revisit this block after we've visited + // the pred? + assert(!PredScoreBrackets); + ScoreBrackets->setWaitAtBeginning(); + break; + } + if (!PredScoreBrackets || PredScoreBrackets->getWaitAtBeginning()) { break; } for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; @@ -1305,9 +1316,19 @@ // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK. 
for (MachineBasicBlock *Pred : Block.predecessors()) { BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end(); - if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { + BlockWaitcntBracketsMap[Pred].get(); + bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end(); + if (!Visited && !ScoreBrackets->getRevisitLoop()){ + // pred not visited, so there better be no PredScoreBracket. + // Unfortunately, emit a s_waitcnt 0 since we don't have + // pred info and we're not going to get it. + // FIXME: can we revisit this block after we've visited + // the pred? + assert(!PredScoreBrackets); + ScoreBrackets->setWaitAtBeginning(); + break; + } + if (!PredScoreBrackets || PredScoreBrackets->getWaitAtBeginning()) { break; } Index: test/CodeGen/AMDGPU/waitcnt-no-preds.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/waitcnt-no-preds.ll @@ -0,0 +1,106 @@ +; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s + +; check that the waitcnt pass inserts a S_WAITCNT 0 at the top of a +; block when the preds will never be visited prior to the block + +; CHECK-LABEL: BB0_3: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-LABEL: BB0_4: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + +; Function Attrs: nounwind +define amdgpu_ps void @main(i32 inreg, i32 inreg, i32 inreg, i32 inreg, <2 x float>, <2 x float>, <2 x float>, <3 x float>, <2 x float>, <2 x float>, <2 x float>, float, float, float, float, float, i32, i32, i32, i32) local_unnamed_addr #0 !spirv.ExecutionModel !6 { + %21 = insertelement <2 x i32> , i32 %2, i32 0 + %22 = bitcast <2 x i32> %21 to i64 + %23 = inttoptr i64 %22 to [4294967295 x i8] addrspace(2)* + %24 = extractelement <2 x float> %5, i32 0 + %25 = extractelement <2 x float> %5, i32 1 + %26 = call float @llvm.amdgcn.interp.p1(float %24, i32 0, i32 0, i32 %3) #2 + 
%27 = call float @llvm.amdgcn.interp.p2(float %26, float %25, i32 0, i32 0, i32 %3) #2 + %28 = insertelement <4 x float> undef, float %27, i32 0 + %29 = call float @llvm.amdgcn.interp.p1(float %24, i32 1, i32 0, i32 %3) #2 + %30 = call float @llvm.amdgcn.interp.p2(float %29, float %25, i32 1, i32 0, i32 %3) #2 + %31 = insertelement <4 x float> %28, float %30, i32 1 + %32 = call float @llvm.amdgcn.interp.p1(float %24, i32 2, i32 0, i32 %3) #2 + %33 = call float @llvm.amdgcn.interp.p2(float %32, float %25, i32 2, i32 0, i32 %3) #2 + %34 = insertelement <4 x float> %31, float %33, i32 2 + %35 = call float @llvm.amdgcn.interp.p1(float %24, i32 3, i32 0, i32 %3) #2 + %36 = call float @llvm.amdgcn.interp.p2(float %35, float %25, i32 3, i32 0, i32 %3) #2 + %37 = insertelement <4 x float> %34, float %36, i32 3 + %38 = inttoptr i64 %22 to <4 x i32> addrspace(2)* + %39 = load <4 x i32>, <4 x i32> addrspace(2)* %38, align 16 + %40 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %39, i32 0, i32 0, i1 false, i1 false) #3 + %41 = bitcast float %40 to i32 + %42 = icmp eq i32 %41, 0 + %43 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %23, i64 0, i64 64 + %44 = bitcast i8 addrspace(2)* %43 to <4 x i32> addrspace(2)* + br label %45 + +;