Review summary. Everything above the first "Index:" line is commentary and
is not part of the patch.

lib/Target/AMDGPU/SIInsertWaitcnts.cpp:
  Comment wording changes from vm_wait_cnt / lgk_wait_cnt to vm_cnt /
  lgkm_cnt. In both loops over Block.predecessors(), a predecessor that is
  not in BlockVisitedSet while ScoreBrackets->getRevisitLoop() is false now
  makes the block call ScoreBrackets->setWaitAtBeginning() and stop
  scanning. Otherwise scanning stops when the predecessor has no score
  brackets or has getWaitAtBeginning() set.

test/CodeGen/AMDGPU/waitcnt-no-preds.ll (new file):
  Expects "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)" on the line right after
  the labels matching BB[0-9]_3 and BB[0-9]_4.

NOTE(review): this copy of the patch was rebuilt from text whose newlines
and indentation had been collapsed. Line counts match every hunk header,
but leading whitespace is reconstructed from LLVM style and must be
confirmed against the tree (regenerate the diff if it does not apply).
NOTE(review): two vector constants in the test were missing from the text
and are reconstructed: "<i32 undef, i32 0>" in the insertelement is assumed,
and the shufflevector mask "<i32 0, i32 1, i32 2, i32 3>" is a placeholder.
Confirm both against the author's original test.
NOTE(review): relative to the text received, the added lines now use ") {"
spacing, the assert carries a message, and "emit a s_waitcnt" reads
"emit an s_waitcnt".

Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -984,8 +984,8 @@
               LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
         }
       }
-      // End of for loop that looks at all source operands to decide vm_wait_cnt
-      // and lgk_wait_cnt.
+      // End of for loop that looks at all source operands to decide vm_cnt
+      // and lgkm_cnt.
 
       // Two cases are handled for destination operands:
       // 1) If the destination operand was defined by a load, add the s_waitcnt
@@ -1263,12 +1263,23 @@
   // they will need to retain and not clear their initial state.
 
   // See if there are any uninitialized predecessors. If so, emit an
-  // s_waitcnt 0 at the beginning of the block.
-  for (MachineBasicBlock *pred : Block.predecessors()) {
+  // s_waitcnt 0 at the beginning of the block if we're sure
+  // that we will *not* revisit the block.
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
-        BlockWaitcntBracketsMap[pred].get();
-    bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+        BlockWaitcntBracketsMap[Pred].get();
+    bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+    if (!Visited && !ScoreBrackets->getRevisitLoop()) {
+      // Pred not visited, so there better be no PredScoreBracket.
+      // Unfortunately, emit an s_waitcnt 0 since we don't have
+      // Pred info and we're not going to get it.
+      // FIXME: can we revisit this block after we've visited
+      // the Pred?
+      assert(!PredScoreBrackets && "unvisited Pred has score brackets");
+      ScoreBrackets->setWaitAtBeginning();
+      break;
+    }
+    if (!PredScoreBrackets || PredScoreBrackets->getWaitAtBeginning()) {
       break;
     }
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1305,9 +1316,19 @@
   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
-      BlockWaitcntBracketsMap[Pred].get();
+        BlockWaitcntBracketsMap[Pred].get();
     bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
-    if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+    if (!Visited && !ScoreBrackets->getRevisitLoop()) {
+      // Pred not visited, so there better be no PredScoreBracket.
+      // Unfortunately, emit an s_waitcnt 0 since we don't have
+      // Pred info and we're not going to get it.
+      // FIXME: can we revisit this block after we've visited
+      // the Pred?
+      assert(!PredScoreBrackets && "unvisited Pred has score brackets");
+      ScoreBrackets->setWaitAtBeginning();
+      break;
+    }
+    if (!PredScoreBrackets || PredScoreBrackets->getWaitAtBeginning()) {
       break;
     }
 
Index: test/CodeGen/AMDGPU/waitcnt-no-preds.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/waitcnt-no-preds.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; check that the waitcnt pass inserts a S_WAITCNT 0 at the top of a
+; block when the preds will never be visited prior to the block
+
+; CHECK: {{BB[0-9]_3:}}
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK: {{BB[0-9]_4:}}
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+
+define amdgpu_ps void @main(i32 inreg %arg, <2 x float> %arg1) local_unnamed_addr {
+bb:
+  %tmp = insertelement <2 x i32> <i32 undef, i32 0>, i32 %arg, i32 0
+  %tmp2 = bitcast <2 x i32> %tmp to i64
+  %tmp3 = inttoptr i64 %tmp2 to [4294967295 x i8] addrspace(2)*
+  %tmp4 = extractelement <2 x float> %arg1, i32 1
+  %tmp5 = insertelement <4 x float> undef, float %tmp4, i32 0
+  %tmp6 = inttoptr i64 %tmp2 to <4 x i32> addrspace(2)*
+  %tmp7 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp6, align 16
+  %tmp8 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp7, i32 0, i32 0, i1 false, i1 false)
+  %tmp9 = bitcast float %tmp8 to i32
+  %tmp10 = icmp eq i32 %tmp9, 0
+  %tmp11 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp3, i64 0, i64 64
+  %tmp12 = bitcast i8 addrspace(2)* %tmp11 to <4 x i32> addrspace(2)*
+  br label %bb13
+
+bb13:                                             ; preds = %bb22, %bb
+  %tmp14 = phi <4 x float> [ %tmp5, %bb ], [ %tmp23, %bb22 ]
+  %tmp15 = phi i32 [ 0, %bb ], [ %tmp24, %bb22 ]
+  br i1 %tmp10, label %bb19, label %bb16
+
+bb16:                                             ; preds = %bb13
+  %tmp17 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp7, i32 0, i32 0, i1 false, i1 false)
+  %tmp18 = bitcast float %tmp17 to i32
+  br label %bb19
+
+bb19:                                             ; preds = %bb16, %bb13
+  %tmp20 = phi i32 [ %tmp18, %bb16 ], [ 0, %bb13 ]
+  %tmp21 = icmp slt i32 %tmp15, %tmp20
+  br i1 %tmp21, label %bb22, label %bb25
+
+bb22:                                             ; preds = %bb19
+  %tmp23 = shufflevector <4 x float> %tmp14, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %tmp24 = add i32 %tmp15, 1
+  br label %bb13
+
+bb25:                                             ; preds = %bb19
+  ret void
+}
+
+
+
+; Function Attrs: nounwind readonly
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1)