Index: llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -108,8 +108,8 @@
   void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End,
                               DenseSet<BasicBlock *> &InfluenceRegion);
   // Finds all users of I that are outside the influence region, and add these
-  // users to Worklist.
-  void findUsersOutsideInfluenceRegion(
+  // users to Worklist. Returns true if at least one such user was found.
+  bool findUsersOutsideInfluenceRegion(
       Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion);
 
   Function &F;
@@ -197,8 +197,15 @@
   // dominators of TI until it is outside the influence region.
   BasicBlock *InfluencedBB = ThisBB;
   while (InfluenceRegion.count(InfluencedBB)) {
-    for (auto &I : *InfluencedBB)
-      findUsersOutsideInfluenceRegion(I, InfluenceRegion);
+    for (auto &I : *InfluencedBB) {
+      if (findUsersOutsideInfluenceRegion(I, InfluenceRegion)) {
+        // If a uniform value defined inside a loop with a divergent exit is
+        // used outside the loop, the copy introduced by the LCSSA to normal
+        // form transformation would yield an incorrect value.
+        DV.insert(&I);
+        Worklist.push_back(&I);
+      }
+    }
     DomTreeNode *IDomNode = DT.getNode(InfluencedBB)->getIDom();
     if (IDomNode == nullptr)
       break;
@@ -206,15 +213,18 @@
   }
 }
 
-void DivergencePropagator::findUsersOutsideInfluenceRegion(
+bool DivergencePropagator::findUsersOutsideInfluenceRegion(
     Instruction &I, const DenseSet<BasicBlock *> &InfluenceRegion) {
+  bool Found = false;
   for (User *U : I.users()) {
     Instruction *UserInst = cast<Instruction>(U);
     if (!InfluenceRegion.count(UserInst->getParent())) {
       if (DV.insert(UserInst).second)
         Worklist.push_back(UserInst);
+      Found = true;
     }
   }
+  return Found;
 }
 
 // A helper function for computeInfluenceRegion that adds successors of "ThisBB"
Index: llvm/test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll
===================================================================
--- llvm/test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll
+++ llvm/test/Analysis/LegacyDivergenceAnalysis/NVPTX/diverge.ll
@@ -104,7 +104,7 @@
   br label %loop
 loop:
   %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
-; CHECK-NOT: DIVERGENT: %i =
+; CHECK: DIVERGENT: %i =
   %i1 = add i32 %i, 1
   %exit_cond = icmp sge i32 %i1, %laneid
   br i1 %exit_cond, label %loop_exit, label %loop
@@ -126,7 +126,7 @@
   br label %loop
 loop:
   %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
-; CHECK-NOT: DIVERGENT: %i =
+; CHECK: DIVERGENT: %i =
   %i1 = add i32 %i, 1
   %exit_cond = icmp sge i32 %i1, %tid
   br i1 %exit_cond, label %loop_exit, label %loop
Index: llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
+++ llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
@@ -13,47 +13,51 @@
 ; CHECK:       ; %bb.0: ; %start
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
 ; CHECK-NEXT:    s_mov_b32 m0, s0
-; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s2, 0
 ; CHECK-NEXT:    v_interp_p1_f32_e32 v0, v1, attr0.x
 ; CHECK-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0
-; CHECK-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; CHECK-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; CHECK-NEXT:  BB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], exec
-; CHECK-NEXT:    s_cmp_lt_u32 s0, 32
-; CHECK-NEXT:    s_mov_b64 s[6:7], -1
+; CHECK-NEXT:    s_or_b64 s[6:7], s[6:7], exec
+; CHECK-NEXT:    s_or_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT:    s_cmp_lt_u32 s2, 32
 ; CHECK-NEXT:    s_cbranch_scc0 BB0_5
 ; CHECK-NEXT:  ; %bb.2: ; %endif1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_mov_b64 s[4:5], -1
-; CHECK-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; CHECK-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[6:7], -1
+; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; CHECK-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
 ; CHECK-NEXT:    ; mask branch BB0_4
 ; CHECK-NEXT:  BB0_3: ; %endif2
 ; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_add_i32 s0, s0, 1
-; CHECK-NEXT:    s_xor_b64 s[4:5], exec, -1
+; CHECK-NEXT:    s_add_i32 s2, s2, 1
+; CHECK-NEXT:    s_xor_b64 s[6:7], exec, -1
 ; CHECK-NEXT:  BB0_4: ; %Flow1
 ; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
-; CHECK-NEXT:    s_mov_b64 s[6:7], 0
-; CHECK-NEXT:  BB0_5: ; %Flow
+; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT:    s_branch BB0_6
+; CHECK-NEXT:  BB0_5: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    ; implicit-def: $sgpr2
+; CHECK-NEXT:  BB0_6: ; %Flow
 ; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[4:5]
-; CHECK-NEXT:    s_or_b64 s[8:9], s[8:9], s[2:3]
-; CHECK-NEXT:    s_mov_b64 s[2:3], s[8:9]
+; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
+; CHECK-NEXT:    s_or_b64 s[8:9], s[8:9], s[4:5]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[8:9]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
 ; CHECK-NEXT:    s_cbranch_execnz BB0_1
-; CHECK-NEXT:  ; %bb.6: ; %Flow2
+; CHECK-NEXT:  ; %bb.7: ; %Flow2
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], s[6:7]
-; CHECK-NEXT:    ; mask branch BB0_8
-; CHECK-NEXT:  BB0_7: ; %if1
+; CHECK-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; CHECK-NEXT:    ; mask branch BB0_9
+; CHECK-NEXT:  BB0_8: ; %if1
 ; CHECK-NEXT:    v_sqrt_f32_e32 v1, v0
-; CHECK-NEXT:  BB0_8: ; %endloop
-; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:  BB0_9: ; %endloop
+; CHECK-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; CHECK-NEXT:    exp mrt0 v1, v1, v1, v1 done vm
 ; CHECK-NEXT:    s_endpgm
 ; this is the divergent branch with the condition not marked as divergent
Index: llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -4,11 +4,17 @@
 ; SI-LABEL: {{^}}i1_copy_from_loop:
 ;
 ; SI: ; %for.body
-; SI: v_cmp_lt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], s{{[0-9+]}}, 4
+; SI: v_cmp_lt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 4
+; SI-DAG: s_andn2_b64 [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
+; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
+
+; SI: ; %Flow1
+; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec
 
 ; SI: ; %Flow
 ; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
-; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
+; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
 ; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
 
 ; SI: ; %for.end
Index: llvm/test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -107,6 +107,7 @@
 ; GCN: s_branch [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: s_mov_b64 [[BREAK]], -1{{$}}
+; GCN: [[FLOW]]: ; %Flow
 
 ; GCN: ; %case0
 ; GCN: buffer_load_dword [[LOAD1:v[0-9]+]],
@@ -115,7 +116,7 @@
 ; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
 ; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]]
 
-; GCN: [[FLOW]]: ; %Flow4
+; GCN: ; %Flow4
 ; GCN: s_and_b64 [[BREAK]], exec, [[BREAK]]
 ; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT]]
 ; GCN: s_andn2_b64 exec, exec, [[LEFT]]
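
Note for reviewers (not part of the patch): a minimal IR sketch of the
pattern the LegacyDivergenceAnalysis change addresses, adapted from the
updated NVPTX diverge.ll checks; the function and value names here are
illustrative. %i has the same value in every lane on any given iteration,
but the exit condition depends on the lane ID, so lanes leave the loop on
different iterations and the LCSSA copy of %i observes a different value
per lane. The analysis must therefore report %i as divergent.

  declare i32 @llvm.nvvm.read.ptx.sreg.laneid()

  define i32 @uniform_value_divergent_exit() {
  entry:
    %laneid = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
    br label %loop

  loop:
    ; Uniform within any one iteration ...
    %i = phi i32 [ 0, %entry ], [ %i1, %loop ]
    %i1 = add i32 %i, 1
    ; ... but the exit is divergent: each lane leaves on its own iteration.
    %exit_cond = icmp sge i32 %i1, %laneid
    br i1 %exit_cond, label %loop_exit, label %loop

  loop_exit:
    ; The LCSSA phi reads a per-lane value of %i, so %i (and this copy)
    ; must be marked divergent.
    %i.lcssa = phi i32 [ %i, %loop ]
    ret i32 %i.lcssa
  }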
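
The rewritten i1-copy-from-loop.ll checks all match one idiom: an i1 value
that leaves a divergent loop is carried in a 64-bit lane mask that is
updated only for the currently active lanes, via the
s_andn2_b64 / s_and_b64 / s_or_b64 triple in the FileCheck patterns. A
scalar sketch of that merge (illustrative IR, with the EXEC mask modelled
as a plain i64 argument rather than the real hardware register):

  define i64 @merge_lane_mask(i64 %accum, i64 %cond, i64 %exec) {
    ; s_andn2_b64: keep the accumulated bits of the inactive lanes.
    %not_exec = xor i64 %exec, -1
    %keep = and i64 %accum, %not_exec
    ; s_and_b64: restrict the new condition bits to the active lanes.
    %new = and i64 %cond, %exec
    ; s_or_b64: combine both halves into the updated accumulator.
    %merged = or i64 %keep, %new
    ret i64 %merged
  }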