Index: llvm/lib/Transforms/Scalar/JumpThreading.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -246,7 +246,7 @@
                                          FunctionAnalysisManager &AM) {
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
   // Jump Threading has no sense for the targets with divergent CF
-  if (TTI.hasBranchDivergence())
+  if (TTI.hasBranchDivergence(&F))
     return PreservedAnalyses::all();
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto &LVI = AM.getResult<LazyValueAnalysis>(F);
Index: llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -3547,6 +3547,8 @@
     return true;
   }
 
+  const Function *F = L.getHeader()->getParent();
+
   // Check whether we should continue with non-trivial conditions.
   // EnableNonTrivialUnswitch: Global variable that forces non-trivial
   //                           unswitching for testing and debugging.
@@ -3559,12 +3561,12 @@
   // branches even on targets that have divergence.
   // https://bugs.llvm.org/show_bug.cgi?id=48819
   bool ContinueWithNonTrivial =
-      EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence());
+      EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence(F));
   if (!ContinueWithNonTrivial)
     return false;
 
   // Skip non-trivial unswitching for optsize functions.
-  if (L.getHeader()->getParent()->hasOptSize())
+  if (F->hasOptSize())
     return false;
 
   // Returns true if Loop L's loop nest is cold, i.e. if the headers of L,
Index: llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -152,7 +152,7 @@
 namespace llvm {
 
 bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
-  if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
+  if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence(&F)) {
     LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
                          "TTI->hasBranchDivergence() is false.\n");
     return false;
Index: llvm/test/Transforms/JumpThreading/divergent-target-test.ll
===================================================================
--- llvm/test/Transforms/JumpThreading/divergent-target-test.ll
+++ llvm/test/Transforms/JumpThreading/divergent-target-test.ll
@@ -45,3 +45,39 @@
 ; UNIFORM: ret i32 %v2
   ret i32 %B
 }
+
+; Check that the divergence check is skipped if there can't be
+; divergence in the function.
+define i32 @requires_single_lane_exec(i1 %cond) #0 {
+; CHECK: requires_single_lane_exec
+  br i1 %cond, label %T1, label %F1
+
+; CHECK-NOT: T1
+T1:
+  %v1 = call i32 @f1()
+  br label %Merge
+; CHECK-NOT: F1
+F1:
+  %v2 = call i32 @f2()
+  br label %Merge
+; CHECK-NOT: Merge
+Merge:
+  %A = phi i1 [true, %T1], [false, %F1]
+  %B = phi i32 [%v1, %T1], [%v2, %F1]
+  br i1 %A, label %T2, label %F2
+
+T2:
+; CHECK: T2:
+; CHECK: %v1 = call i32 @f1()
+; CHECK: call void @f3()
+; CHECK: ret i32 %v1
+  call void @f3()
+  ret i32 %B
+F2:
+; CHECK: F2:
+; CHECK: %v2 = call i32 @f2()
+; CHECK: ret i32 %v2
+  ret i32 %B
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,1" }
Index: llvm/test/Transforms/SimpleLoopUnswitch/AMDGPU/nontrivial-unswitch-divergent-target.ll
===================================================================
--- llvm/test/Transforms/SimpleLoopUnswitch/AMDGPU/nontrivial-unswitch-divergent-target.ll
+++ llvm/test/Transforms/SimpleLoopUnswitch/AMDGPU/nontrivial-unswitch-divergent-target.ll
@@ -123,50 +123,101 @@
 entry:
   br label %loop_begin
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   br label %loop_begin
+; CHECK-NEXT:   br i1 %cond1, label %entry.split.us, label %entry.split
 
 loop_begin:
   br i1 %cond1, label %loop_a, label %loop_b
-; CHECK: loop_begin:
-; CHECK-NEXT: br i1 %cond1, label %loop_a, label %loop_b
 
 loop_a:
-  %unused.a = call i32 @a()
+  call i32 @a()
   br label %latch
-; CHECK: loop_a:
-; CHECK-NEXT: %unused.a = call i32 @a()
-; CHECK-NEXT: br label %latch
+; The 'loop_a' unswitched loop.
+;
+; CHECK: entry.split.us:
+; CHECK-NEXT: br label %loop_begin.us
+;
+; CHECK: loop_begin.us:
+; CHECK-NEXT: br label %loop_a.us
+;
+; CHECK: loop_a.us:
+; CHECK-NEXT: call i32 @a()
+; CHECK-NEXT: br label %latch.us
+;
+; CHECK: latch.us:
+; CHECK-NEXT: %[[V:.*]] = load i1, ptr %ptr
+; CHECK-NEXT: br i1 %[[V]], label %loop_begin.us, label %loop_exit.split.us
+;
+; CHECK: loop_exit.split.us:
+; CHECK-NEXT: br label %loop_exit
 
 loop_b:
   br i1 %cond2, label %loop_b_a, label %loop_b_b
-; CHECK: loop_b:
-; CHECK-NEXT: br i1 %cond2, label %loop_b_a, label %loop_b_b
+; The second unswitched condition.
+;
+; CHECK: entry.split:
+; CHECK-NEXT: br i1 %cond2, label %entry.split.split.us, label %entry.split.split
 
 loop_b_a:
-  %unused.b = call i32 @b()
+  call i32 @b()
   br label %latch
-; CHECK: loop_b_a:
-; CHECK-NEXT: %unused.b = call i32 @b()
-; CHECK-NEXT: br label %latch
+; The 'loop_b_a' unswitched loop.
+;
+; CHECK: entry.split.split.us:
+; CHECK-NEXT: br label %loop_begin.us1
+;
+; CHECK: loop_begin.us1:
+; CHECK-NEXT: br label %loop_b.us
+;
+; CHECK: loop_b.us:
+; CHECK-NEXT: br label %loop_b_a.us
+;
+; CHECK: loop_b_a.us:
+; CHECK-NEXT: call i32 @b()
+; CHECK-NEXT: br label %latch.us2
+;
+; CHECK: latch.us2:
+; CHECK-NEXT: %[[V:.*]] = load i1, ptr %ptr
+; CHECK-NEXT: br i1 %[[V]], label %loop_begin.us1, label %loop_exit.split.split.us
+;
+; CHECK: loop_exit.split.split.us:
+; CHECK-NEXT: br label %loop_exit.split
 
 loop_b_b:
-  %unused.c = call i32 @c()
+  call i32 @c()
   br label %latch
-; CHECK: loop_b_b:
-; CHECK-NEXT: %unused.c = call i32 @c()
-; CHECK-NEXT: br label %latch
+; The 'loop_b_b' unswitched loop.
+;
+; CHECK: entry.split.split:
+; CHECK-NEXT: br label %loop_begin
+;
+; CHECK: loop_begin:
+; CHECK-NEXT: br label %loop_b
+;
+; CHECK: loop_b:
+; CHECK-NEXT: br label %loop_b_b
+;
+; CHECK: loop_b_b:
+; CHECK-NEXT: call i32 @c()
+; CHECK-NEXT: br label %latch
+;
+; CHECK: latch:
+; CHECK-NEXT: %[[V:.*]] = load i1, ptr %ptr
+; CHECK-NEXT: br i1 %[[V]], label %loop_begin, label %loop_exit.split.split
+;
+; CHECK: loop_exit.split.split:
+; CHECK-NEXT: br label %loop_exit.split
 
 latch:
   %v = load i1, ptr %ptr
   br i1 %v, label %loop_begin, label %loop_exit
-; CHECK: latch:
-; CHECK-NEXT: %v = load i1, ptr %ptr
-; CHECK-NEXT: br i1 %v, label %loop_begin, label %loop_exit
 
 loop_exit:
   ret void
-; CHECK: loop_exit:
-; CHECK-NEXT: ret void
+; CHECK: loop_exit.split:
+; CHECK-NEXT: br label %loop_exit
+;
+; CHECK: loop_exit:
+; CHECK-NEXT: ret
 }
 
 attributes #0 = { "amdgpu-flat-work-group-size"="1,1" }
Index: llvm/test/Transforms/SpeculativeExecution/single-lane-execution.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SpeculativeExecution/single-lane-execution.ll
@@ -0,0 +1,25 @@
+; REQUIRES: amdgpu-registered-target
+; RUN: opt -S -passes=speculative-execution -mtriple=amdgcn-- \
+; RUN:   -spec-exec-only-if-divergent-target \
+; RUN:   -spec-exec-max-speculation-cost 4 -spec-exec-max-not-hoisted 3 \
+; RUN:   %s | FileCheck %s
+
+; Nothing is hoisted in this if-then pattern: the single-lane function has no divergence, so the pass is skipped.
+define void @skip_single_lane_ifThen() #0 {
+; CHECK-LABEL: @skip_single_lane_ifThen(
+; CHECK: br i1 true
+
+br i1 true, label %a, label %b
+; CHECK: a:
+; CHECK: %x = add i32 2, 3
+a:
+  %x = add i32 2, 3
+; CHECK: br label
+  br label %b
+; CHECK: b:
+b:
+; CHECK: ret void
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,1" }
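
Not part of the patch: a minimal sketch of how a new-pass-manager function pass would use the Function-aware hasBranchDivergence() overload that the callers above are switching to. The pass name and the placeholder transform are hypothetical; the TTI query mirrors the updated JumpThreading code, so a kernel restricted to a single lane (e.g. "amdgpu-flat-work-group-size"="1,1") is no longer treated as divergent.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

namespace {
// Hypothetical example pass, not part of this diff.
struct DivergenceGatedExamplePass : PassInfoMixin<DivergenceGatedExamplePass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    auto &TTI = AM.getResult<TargetIRAnalysis>(F);
    // Passing &F lets TTI consult function attributes, so a single-lane
    // AMDGPU kernel reports no branch divergence and is not skipped.
    if (TTI.hasBranchDivergence(&F))
      return PreservedAnalyses::all();
    bool Changed = false;
    // ... divergence-sensitive CFG rewriting would go here ...
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};
} // end anonymous namespace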