Index: llvm/lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -101,6 +101,10 @@
              "to speculatively execute to fold a 2-entry PHI node into a "
              "select (default = 4)"));
 
+static cl::opt<bool> CheckDomBlockInstructionsCount(
+    "simplifycfg-db-instr-count", cl::Hidden, cl::init(false),
+    cl::desc("Do not merge BBs if domblock already has more than phi-node-folding-threshold instructions"));
+
 static cl::opt<bool> DupRet(
     "simplifycfg-dup-ret", cl::Hidden, cl::init(false),
     cl::desc("Duplicate return instructions into unconditional branches"));
@@ -2413,6 +2417,24 @@
                     << "  T: " << IfTrue->getName()
                     << "  F: " << IfFalse->getName() << "\n");
 
+  // We need to be sure that DomBlock has
+  // enough room for the new instructions.
+  // First, add the cost of the select instructions that will be added to this
+  // block (this cost is equal to the number of PHI nodes in BB).
+  unsigned Cost = NumPhis;
+
+  if (CheckDomBlockInstructionsCount) {
+    for (const auto& Instr : *DomBlock) {
+      if (!isa<BranchInst>(&Instr))
+        Cost += TTI.getUserCost(&Instr);
+    }
+
+    if (Cost > PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic) {
+      // DomBlock is already too large.
+      return false;
+    }
+  }
+
   // If we can still promote the PHI nodes after this gauntlet of tests,
   // do all of the PHI's now.
   Instruction *InsertPt = DomBlock->getTerminator();
Index: llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll
@@ -0,0 +1,84 @@
+; RUN: opt < %s -simplifycfg-db-instr-count=false -mtriple=aarch64-linux-gnu -simplifycfg -S | FileCheck %s --check-prefix=CHECK-MERGE
+; RUN: opt < %s -simplifycfg-db-instr-count=true  -mtriple=aarch64-linux-gnu -simplifycfg -S | FileCheck %s --check-prefix=CHECK-SEPARATE
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+%struct.ptr_wrapper = type { i64*, i8* }
+
+; When merge is enabled, 'if.then' and 'if.end' should be merged to 'for.body'
+; CHECK-MERGE-LABEL: for.body:
+; CHECK-MERGE-NOT: if.then:
+; CHECK-MERGE-NOT: if.end:
+; And two consecutive 'select' instructions are generated
+; CHECK-MERGE: select i1 %tobool
+; CHECK-MERGE-NEXT: select i1 %tobool
+
+; When merge is disabled, instruction 'select' should not be generated
+; CHECK-SEPARATE-LABEL: for.body:
+; CHECK-SEPARATE-NOT: select
+; CHECK-SEPARATE-LABEL: if.then:
+; CHECK-SEPARATE-LABEL: if.end:
+
+; Function Attrs: nofree noinline norecurse nounwind
+define dso_local i32 @test_func(%struct.ptr_wrapper* nocapture %wrapper, i32 %def_mask, i32 %bit_to_compare, i32 %bit, i32 %mask) local_unnamed_addr #0 {
+entry:
+  %proc = getelementptr inbounds %struct.ptr_wrapper, %struct.ptr_wrapper* %wrapper, i64 0, i32 1
+  %and3 = and i32 %mask, %def_mask
+  %tobool4 = icmp eq i32 %and3, 0
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %if.end7
+  %retval1.1.lcssa = phi i32 [ %retval1.1, %if.end7 ]
+  %res_in.1.lcssa = phi i32 [ %res_in.1, %if.end7 ]
+  %conv = zext i32 %res_in.1.lcssa to i64
+  %res_in8 = getelementptr inbounds %struct.ptr_wrapper, %struct.ptr_wrapper* %wrapper, i64 0, i32 0
+  %0 = load i64*, i64** %res_in8, align 8, !tbaa !2
+  store i64 %conv, i64* %0, align 8, !tbaa !7
+  ret i32 %retval1.1.lcssa
+
+for.body:                                         ; preds = %if.end7, %entry
+  %j.022 = phi i32 [ 0, %entry ], [ %inc, %if.end7 ]
+  %res_in.021 = phi i32 [ 0, %entry ], [ %res_in.1, %if.end7 ]
+  %retval1.020 = phi i32 [ 0, %entry ], [ %retval1.1, %if.end7 ]
+  %bit.addr.019 = phi i32 [ %bit, %entry ], [ %shl, %if.end7 ]
+  %inc = add nuw nsw i32 %j.022, 1
+  %shl = shl i32 %bit.addr.019, 1
+  %and = and i32 %shl, %bit_to_compare
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %if.end, label %if.then
+if.then:                                          ; preds = %for.body
+  %or = or i32 %res_in.021, %shl
+  %inc2 = add nsw i32 %retval1.020, 1
+  store i8* null, i8** %proc, align 8, !tbaa !9
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.then
+  %retval1.1 = phi i32 [ %inc2, %if.then ], [ %retval1.020, %for.body ]
+  %res_in.1 = phi i32 [ %or, %if.then ], [ %res_in.021, %for.body ]
+  br i1 %tobool4, label %if.end7, label %if.then5
+
+if.then5:                                         ; preds = %if.end
+  store i8* null, i8** %proc, align 8, !tbaa !9
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.end, %if.then5
+  %exitcond = icmp eq i32 %inc, 64
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nofree noinline norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 10.0.0 (http://gitlab-msc-rd.rnd.huawei.com/rus-os-team/compilers/llvm/llvm-project.git 65c11cb4cb258372187bc5a959312b62ac705f94)"}
+!2 = !{!3, !4, i64 0}
+!3 = !{!"", !4, i64 0, !4, i64 8}
+!4 = !{!"any pointer", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"long", !5, i64 0}
+!9 = !{!3, !4, i64 8}