diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp --- a/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/llvm/lib/Transforms/Scalar/Sink.cpp @@ -178,9 +178,6 @@ static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI, AAResults &AA) { - // Can't sink anything out of a block that has less than two successors. - if (BB.getTerminator()->getNumSuccessors() <= 1) return false; - // Don't bother sinking code out of unreachable blocks. In addition to being // unprofitable, it can also lead to infinite looping, because in an // unreachable loop there may be nowhere to stop. diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll --- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll +++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll @@ -50,7 +50,8 @@ ; Make sure there's no verifier error with an undef source. ; SI-LABEL: {{^}}bitset_verifier_error: -; SI: s_bitset0_b32 s{{[0-9]+}}, 31 +; SI-NOT: %bb.1: +; SI: v_cmp_ge_f32_e64 define void @bitset_verifier_error() local_unnamed_addr #0 { bb: %i = call float @llvm.fabs.f32(float undef) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -207,8 +207,7 @@ ;GCN-LABEL: {{^}}s_buffer_load_index_across_bb: ;GCN-NOT: s_waitcnt; -;GCN: v_or_b32 -;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen offset:8 define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) { main_body: %tmp = shl i32 %index, 4 @@ -224,10 +223,7 @@ ;GCN-LABEL: {{^}}s_buffer_load_index_across_bb_merged: ;GCN-NOT: s_waitcnt; -;GCN: v_or_b32 -;GCN: v_or_b32 -;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen -;GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen +;GCN: buffer_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen offset:8 define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) { main_body: %tmp = shl i32 %index, 4 diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll --- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll @@ -126,7 +126,7 @@ ; There should be exact one folding on the same operand. ; CHECK-LABEL: {{^}}no_extra_fold_on_same_opnd -; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; CHECK-NOT: %bb.1: ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @no_extra_fold_on_same_opnd() #1 { entry: diff --git a/llvm/test/Transforms/Sink/single-succ.ll b/llvm/test/Transforms/Sink/single-succ.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Sink/single-succ.ll @@ -0,0 +1,17 @@ +; RUN: opt -S < %s -passes=sink | FileCheck %s + +; CHECK-LABEL: else: +; CHECK-NEXT: %l = load i32, i32* %a, align 4 +; CHECK-NEXT: ret i32 %l + +define i32 @single_succ(i1 %b, i32* %a) { +entry: + %l = load i32, i32* %a, align 4 + br label %if +if: + br i1 %b, label %then, label %else +then: + ret i32 42 +else: + ret i32 %l +}