Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -130,10 +130,18 @@
 
   unsigned NumInstr = 0;
 
-  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
-       MBB = *MBB->succ_begin()) {
+  // Walk the blocks in layout order from From up to (not including) To,
+  // instead of following only the first successor of each block. Also stop
+  // at the end of the function so the iterator is never advanced past the
+  // last block when To does not come after From in layout order.
+  for (MachineFunction::iterator MBBI = MachineFunction::iterator(From),
+                                 ToI = MachineFunction::iterator(To),
+                                 End = From->getParent()->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
 
-    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+    MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
          NumInstr < SkipThreshold && I != E; ++I) {
 
       if (I->isBundle() || !I->isBundled())
Index: test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -24,5 +24,39 @@
   ret void
 }
 
+;CHECK-LABEL: {{^}}test2:
+;CHECK: s_and_saveexec_b64
+;CHECK: s_xor_b64
+;CHECK-NEXT: s_cbranch_execz
+define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+main_body:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
+  %cc = icmp eq i32 %tid, 0
+  br i1 %cc, label %done1, label %if
+
+if:
+  %cmp = icmp eq i32 %a, 0
+  br i1 %cmp, label %done0, label %loop_body
+
+loop_body:
+  %counter = phi i32 [ 0, %if ], [0, %done0], [ %incr, %loop_body ]
+
+  ; Prevent the loop from being optimized out
+  call void asm sideeffect "", "" ()
+
+  %incr = add i32 %counter, 1
+  %lc = icmp sge i32 %incr, 1000
+  br i1 %lc, label %done1, label %loop_body
+
+done0:
+  %cmp0 = icmp eq i32 %b, 0
+  br i1 %cmp0, label %done1, label %loop_body
+
+done1:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
 attributes #0 = { "ShaderType"="0" }
 attributes #1 = { nounwind readonly }