Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -396,6 +396,14 @@ /// incurs significant execution cost. bool isLoweredToCall(const Function *F) const; + /// + /// \brief isExpensiveForFolding checks whether an instruction can be replaced + /// with a conditional instruction (csel/cset etc); if not, the instruction is + /// considered expensive + /// \param I - instruction to check + /// \return true if it cannot be replaced with one conditional instruction + bool isExpensiveForFolding(const Instruction& I) const; + struct LSRCost { /// TODO: Some of these could be merged. Also, a lexical ordering /// isn't always optimal. @@ -1178,6 +1186,7 @@ Intrinsic::ID IID) const = 0; virtual bool rewriteIntrinsicWithAddressSpace( IntrinsicInst *II, Value *OldV, Value *NewV) const = 0; + virtual bool isExpensiveForFolding(const Instruction& I) const = 0; virtual bool isLoweredToCall(const Function *F) = 0; virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP) = 0; @@ -1428,6 +1437,10 @@ return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV); } + bool isExpensiveForFolding(const Instruction& I) const override { + return Impl.isExpensiveForFolding(I); + } + bool isLoweredToCall(const Function *F) override { return Impl.isLoweredToCall(F); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -166,6 +166,11 @@ return false; } + bool isExpensiveForFolding(const Instruction& I) const { + (void)I; + return false; + } + bool isLoweredToCall(const Function *F) { assert(F && "A concrete function must be provided to this routine."); Index: 
llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -247,6 +247,10 @@ return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); } +bool TargetTransformInfo::isExpensiveForFolding(const Instruction& I) const { + return TTIImpl->isExpensiveForFolding(I); +} + void TargetTransformInfo::getUnrollingPreferences( Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const { return TTIImpl->getUnrollingPreferences(L, SE, UP); Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -172,6 +172,17 @@ return 2; } + bool isExpensiveForFolding(const Instruction& I) const { + switch (I.getOpcode()) { + case Instruction::Or: + case Instruction::And: + case Instruction::Xor: + return true; + default: + return false; + } + } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; Index: llvm/lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -127,6 +127,11 @@ cl::desc("Limit maximum recursion depth when calculating costs of " "speculatively executed instructions")); +static cl::opt<bool> CheckSpeculationCost( + "check-speculation-cost", cl::Hidden, cl::init(true), + cl::desc("When merging conditional stores, add additional cost to instructions " + "which cannot be represented by one conditional instruction")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -2945,7 +2950,8 @@ BasicBlock *QTB, BasicBlock *QFB, BasicBlock *PostBB, Value 
*Address, bool InvertPCond, bool InvertQCond, - const DataLayout &DL) { + const DataLayout &DL, + const TargetTransformInfo &TTI) { auto IsaBitcastOfPointerType = [](const Instruction &I) { return Operator::getOpcode(&I) == Instruction::BitCast && I.getType()->isPointerTy(); @@ -2963,8 +2969,14 @@ for (auto &I : BB->instructionsWithoutDebug()) { // Cheap instructions viable for folding. if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) || - isa<StoreInst>(I)) + isa<StoreInst>(I)) { ++N; + + if (CheckSpeculationCost && TTI.isExpensiveForFolding(I)) { + ++N; + } + } + // Free instructions. else if (I.isTerminator() || IsaBitcastOfPointerType(I)) continue; @@ -3098,7 +3110,7 @@ } static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, - const DataLayout &DL) { + const DataLayout &DL, const TargetTransformInfo &TTI) { // The intention here is to find diamonds or triangles (see below) where each // conditional block contains a store to the same address. Both of these // stores are conditional, so they can't be unconditionally sunk. But it may @@ -3200,7 +3212,7 @@ bool Changed = false; for (auto *Address : CommonAddresses) Changed |= mergeConditionalStoreToAddress( - PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL); + PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL, TTI); return Changed; } @@ -3209,7 +3221,7 @@ /// that PBI and BI are both conditional branches, and BI is in one of the /// successor blocks of PBI - PBI branches to BI. static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, - const DataLayout &DL) { + const DataLayout &DL, const TargetTransformInfo& TTI) { assert(PBI->isConditional() && BI->isConditional()); BasicBlock *BB = BI->getParent(); @@ -3265,7 +3277,7 @@ // If both branches are conditional and both contain stores to the same // address, remove the stores from the conditionals and create a conditional // merged store at the end. 
- if (MergeCondStores && mergeConditionalStores(PBI, BI, DL)) + if (MergeCondStores && mergeConditionalStores(PBI, BI, DL, TTI)) return true; // If this is a conditional branch in an empty block, and if any @@ -5937,7 +5949,7 @@ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI, DL)) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL, TTI)) return requestResimplify(); // Look for diamond patterns. @@ -5945,7 +5957,7 @@ if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (mergeConditionalStores(PBI, BI, DL)) + if (mergeConditionalStores(PBI, BI, DL, TTI)) return requestResimplify(); return false; Index: llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll @@ -0,0 +1,104 @@ +; RUN: opt < %s -mtriple=aarch64-linux-gnu -simplifycfg -check-speculation-cost=true -S >%t +; RUN: FileCheck %s < %t +; ModuleID = 'do_select.c' + +%struct.fd_set_bits = type { i64*, i64*, i64*, i64*, i64*, i64*, i32, i8* } + +@g_max_i = common dso_local global i64 0, align 8 +@gv_fds = common dso_local global %struct.fd_set_bits* null, align 8 + +; Function Attrs: nofree noinline norecurse nounwind +define dso_local i32 @do_select(i32 %max_iters_count, i64 %in, i64 %out, i64 %ex, i64 %bit_init_val, i64 %mask) local_unnamed_addr #0 { +entry: + %cmp52 = icmp sgt i32 %max_iters_count, 0 + br i1 %cmp52, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %and13 = and i64 %mask, 780 + %tobool14 = icmp eq i64 %and13, 0 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup5 + %retval1.1.lcssa.lcssa = 
phi i32 [ %retval1.1.lcssa, %for.cond.cleanup5 ] + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %retval1.0.lcssa = phi i32 [ 0, %entry ], [ %retval1.1.lcssa.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %retval1.0.lcssa + +for.body: ; preds = %for.cond.cleanup5, %for.body.lr.ph + %retval1.054 = phi i32 [ 0, %for.body.lr.ph ], [ %retval1.1.lcssa, %for.cond.cleanup5 ] + %k.053 = phi i32 [ 0, %for.body.lr.ph ], [ %inc23, %for.cond.cleanup5 ] + %0 = load volatile %struct.fd_set_bits*, %struct.fd_set_bits** @gv_fds, align 8 + %res_in = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits* %0, i64 0, i32 3 + %1 = load i64*, i64** %res_in, align 8 + %2 = load volatile i64, i64* @g_max_i, align 8 + %cmp348 = icmp eq i64 %2, 0 + br i1 %cmp348, label %for.cond.cleanup5, label %for.cond8.preheader.lr.ph + +for.cond8.preheader.lr.ph: ; preds = %for.body + %proc = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits* %0, i64 0, i32 7 + br label %for.cond8.preheader + +for.cond8.preheader: ; preds = %for.cond8.preheader.lr.ph, %for.cond.cleanup11 + %indvars.iv = phi i64 [ 0, %for.cond8.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup11 ] + %rinp.050 = phi i64* [ %1, %for.cond8.preheader.lr.ph ], [ %incdec.ptr, %for.cond.cleanup11 ] + %retval1.149 = phi i32 [ %retval1.054, %for.cond8.preheader.lr.ph ], [ %retval1.3.lcssa, %for.cond.cleanup11 ] + br label %for.body12 + +for.cond.cleanup5.loopexit: ; preds = %for.cond.cleanup11 + %retval1.3.lcssa.lcssa = phi i32 [ %retval1.3.lcssa, %for.cond.cleanup11 ] + br label %for.cond.cleanup5 + +for.cond.cleanup5: ; preds = %for.cond.cleanup5.loopexit, %for.body + %retval1.1.lcssa = phi i32 [ %retval1.054, %for.body ], [ %retval1.3.lcssa.lcssa, %for.cond.cleanup5.loopexit ] + %inc23 = add nuw nsw i32 %k.053, 1 + %exitcond56 = icmp eq i32 %inc23, %max_iters_count + br i1 %exitcond56, label %for.cond.cleanup.loopexit, label %for.body + +for.cond.cleanup11: ; preds 
= %for.inc + %retval1.3.lcssa = phi i32 [ %retval1.3, %for.inc ] + %res_in7.1.lcssa = phi i64 [ %res_in7.1, %for.inc ] + store i64 %res_in7.1.lcssa, i64* %rinp.050, align 8 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %incdec.ptr = getelementptr inbounds i64, i64* %rinp.050, i64 1 + %3 = load volatile i64, i64* @g_max_i, align 8 + %cmp3 = icmp ugt i64 %3, %indvars.iv.next + br i1 %cmp3, label %for.cond8.preheader, label %for.cond.cleanup5.loopexit + +for.body12: ; preds = %for.inc, %for.cond8.preheader + %j.047 = phi i32 [ 0, %for.cond8.preheader ], [ %inc18, %for.inc ] + %res_in7.046 = phi i64 [ 0, %for.cond8.preheader ], [ %res_in7.1, %for.inc ] + %bit.044 = phi i64 [ %bit_init_val, %for.cond8.preheader ], [ %shl, %for.inc ] + %retval1.243 = phi i32 [ %retval1.149, %for.cond8.preheader ], [ %retval1.3, %for.inc ] + %and = and i64 %bit.044, %in + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body12 +; CHECK-LABEL: if.then: +; CHECK-NEXT: %or = or i64 %res_in7.046, %bit.044 +; CHECK-NEXT: %inc = add nsw i32 %retval1.243, 1 +; CHECK-NEXT: store i8* null, i8** %proc, align 8 + %or = or i64 %res_in7.046, %bit.044 + %inc = add nsw i32 %retval1.243, 1 + store i8* null, i8** %proc, align 8 + br label %if.end + +if.end: ; preds = %for.body12, %if.then + %retval1.3 = phi i32 [ %inc, %if.then ], [ %retval1.243, %for.body12 ] + %res_in7.1 = phi i64 [ %or, %if.then ], [ %res_in7.046, %for.body12 ] + br i1 %tobool14, label %for.inc, label %if.then15 + +if.then15: ; preds = %if.end +; CHECK-LABEL: if.then15: +; CHECK-NEXT: store i8* null, i8** %proc, align 8 + store i8* null, i8** %proc, align 8 + br label %for.inc + +for.inc: ; preds = %if.end, %if.then15 + %inc18 = add nuw nsw i32 %j.047, 1 + %shl = shl i64 %bit.044, 1 + %exitcond = icmp eq i32 %inc18, 64 + br i1 %exitcond, label %for.cond.cleanup11, label %for.body12 +}