Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -396,6 +396,14 @@ /// incurs significant execution cost. bool isLoweredToCall(const Function *F) const; + /// + /// \brief isExpensiveForFolding checks whether an instruction can be replaced + /// with a conditional instruction (csel/cset etc); if not, the instruction is + /// considered expensive + /// \param I - instruction to check + /// \return true if it cannot be replaced with one conditional instruction + bool isExpensiveForFolding(const Instruction& I) const; + struct LSRCost { /// TODO: Some of these could be merged. Also, a lexical ordering /// isn't always optimal. @@ -1178,6 +1186,7 @@ Intrinsic::ID IID) const = 0; virtual bool rewriteIntrinsicWithAddressSpace( IntrinsicInst *II, Value *OldV, Value *NewV) const = 0; + virtual bool isExpensiveForFolding(const Instruction& I) const = 0; virtual bool isLoweredToCall(const Function *F) = 0; virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP) = 0; @@ -1428,6 +1437,10 @@ return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV); } + bool isExpensiveForFolding(const Instruction& I) const override { + return Impl.isExpensiveForFolding(I); + } + bool isLoweredToCall(const Function *F) override { return Impl.isLoweredToCall(F); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -166,6 +166,11 @@ return false; } + bool isExpensiveForFolding(const Instruction& I) const { + (void)I; + return false; + } + bool isLoweredToCall(const Function *F) { assert(F && "A concrete function must be provided to this routine."); Index: 
llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -247,6 +247,10 @@ return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); } +bool TargetTransformInfo::isExpensiveForFolding(const Instruction& I) const { + return TTIImpl->isExpensiveForFolding(I); +} + void TargetTransformInfo::getUnrollingPreferences( Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const { return TTIImpl->getUnrollingPreferences(L, SE, UP); Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -172,6 +172,17 @@ return 2; } + bool isExpensiveForFolding(const Instruction& I) const { + switch (I.getOpcode()) { + case Instruction::Or: + case Instruction::And: + case Instruction::Xor: + return true; + default: + return false; + } + } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; Index: llvm/lib/Transforms/Utils/SimplifyCFG.cpp =================================================================== --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -127,6 +127,11 @@ cl::desc("Limit maximum recursion depth when calculating costs of " "speculatively executed instructions")); +static cl::opt<bool> CheckSpeculationCost( + "check-speculation-cost", cl::Hidden, cl::init(true), + cl::desc("When merging conditional stores, add additional cost to instructions " + "which cannot be represented by one conditional instruction")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -2945,7 +2950,8 @@ BasicBlock *QTB, BasicBlock *QFB, BasicBlock *PostBB, Value 
*Address, bool InvertPCond, bool InvertQCond, - const DataLayout &DL) { + const DataLayout &DL, + const TargetTransformInfo &TTI) { auto IsaBitcastOfPointerType = [](const Instruction &I) { return Operator::getOpcode(&I) == Instruction::BitCast && I.getType()->isPointerTy(); @@ -2963,8 +2969,14 @@ for (auto &I : BB->instructionsWithoutDebug()) { // Cheap instructions viable for folding. if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) || - isa<StoreInst>(I)) + isa<StoreInst>(I)) { ++N; + + if (CheckSpeculationCost && TTI.isExpensiveForFolding(I)) { + ++N; + } + } + // Free instructions. else if (I.isTerminator() || IsaBitcastOfPointerType(I)) continue; @@ -3098,7 +3110,7 @@ } static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, - const DataLayout &DL) { + const DataLayout &DL, const TargetTransformInfo &TTI) { // The intention here is to find diamonds or triangles (see below) where each // conditional block contains a store to the same address. Both of these // stores are conditional, so they can't be unconditionally sunk. But it may @@ -3200,7 +3212,7 @@ bool Changed = false; for (auto *Address : CommonAddresses) Changed |= mergeConditionalStoreToAddress( - PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL); + PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL, TTI); return Changed; } @@ -3209,7 +3221,7 @@ /// that PBI and BI are both conditional branches, and BI is in one of the /// successor blocks of PBI - PBI branches to BI. static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, - const DataLayout &DL) { + const DataLayout &DL, const TargetTransformInfo& TTI) { assert(PBI->isConditional() && BI->isConditional()); BasicBlock *BB = BI->getParent(); @@ -3265,7 +3277,7 @@ // If both branches are conditional and both contain stores to the same // address, remove the stores from the conditionals and create a conditional // merged store at the end. 
- if (MergeCondStores && mergeConditionalStores(PBI, BI, DL)) + if (MergeCondStores && mergeConditionalStores(PBI, BI, DL, TTI)) return true; // If this is a conditional branch in an empty block, and if any @@ -5937,7 +5949,7 @@ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI, DL)) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL, TTI)) return requestResimplify(); // Look for diamond patterns. @@ -5945,7 +5957,7 @@ if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (mergeConditionalStores(PBI, BI, DL)) + if (mergeConditionalStores(PBI, BI, DL, TTI)) return requestResimplify(); return false; Index: llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll @@ -0,0 +1,104 @@ +; RUN: opt < %s -mtriple=aarch64-linux-gnu -simplifycfg -check-speculation-cost=true -S >%t +; RUN: FileCheck %s < %t +; ModuleID = 'do_select.c' + +%struct.fd_set_bits = type { i64*, i64*, i64*, i64*, i64*, i64*, i32, i8* } + +@g_max_i = common dso_local global i64 0, align 8 +@gv_fds = common dso_local global %struct.fd_set_bits* null, align 8 + +; Function Attrs: nofree noinline norecurse nounwind +define dso_local i32 @do_select(i32 %max_iters_count, i64 %in, i64 %out, i64 %ex, i64 %bit_init_val, i64 %mask) local_unnamed_addr #0 { +entry: + %cmp52 = icmp sgt i32 %max_iters_count, 0 + br i1 %cmp52, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %and13 = and i64 %mask, 780 + %tobool14 = icmp eq i64 %and13, 0 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup5 + %retval1.1.lcssa.lcssa = 
phi i32 [ %retval1.1.lcssa, %for.cond.cleanup5 ] + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %retval1.0.lcssa = phi i32 [ 0, %entry ], [ %retval1.1.lcssa.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %retval1.0.lcssa + +for.body: ; preds = %for.cond.cleanup5, %for.body.lr.ph + %retval1.054 = phi i32 [ 0, %for.body.lr.ph ], [ %retval1.1.lcssa, %for.cond.cleanup5 ] + %k.053 = phi i32 [ 0, %for.body.lr.ph ], [ %inc23, %for.cond.cleanup5 ] + %0 = load volatile %struct.fd_set_bits*, %struct.fd_set_bits** @gv_fds, align 8 + %res_in = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits* %0, i64 0, i32 3 + %1 = load i64*, i64** %res_in, align 8 + %2 = load volatile i64, i64* @g_max_i, align 8 + %cmp348 = icmp eq i64 %2, 0 + br i1 %cmp348, label %for.cond.cleanup5, label %for.cond8.preheader.lr.ph + +for.cond8.preheader.lr.ph: ; preds = %for.body + %proc = getelementptr inbounds %struct.fd_set_bits, %struct.fd_set_bits* %0, i64 0, i32 7 + br label %for.cond8.preheader + +for.cond8.preheader: ; preds = %for.cond8.preheader.lr.ph, %for.cond.cleanup11 + %indvars.iv = phi i64 [ 0, %for.cond8.preheader.lr.ph ], [ %indvars.iv.next, %for.cond.cleanup11 ] + %rinp.050 = phi i64* [ %1, %for.cond8.preheader.lr.ph ], [ %incdec.ptr, %for.cond.cleanup11 ] + %retval1.149 = phi i32 [ %retval1.054, %for.cond8.preheader.lr.ph ], [ %retval1.3.lcssa, %for.cond.cleanup11 ] + br label %for.body12 + +for.cond.cleanup5.loopexit: ; preds = %for.cond.cleanup11 + %retval1.3.lcssa.lcssa = phi i32 [ %retval1.3.lcssa, %for.cond.cleanup11 ] + br label %for.cond.cleanup5 + +for.cond.cleanup5: ; preds = %for.cond.cleanup5.loopexit, %for.body + %retval1.1.lcssa = phi i32 [ %retval1.054, %for.body ], [ %retval1.3.lcssa.lcssa, %for.cond.cleanup5.loopexit ] + %inc23 = add nuw nsw i32 %k.053, 1 + %exitcond56 = icmp eq i32 %inc23, %max_iters_count + br i1 %exitcond56, label %for.cond.cleanup.loopexit, label %for.body + +for.cond.cleanup11: ; preds 
= %for.inc + %retval1.3.lcssa = phi i32 [ %retval1.3, %for.inc ] + %res_in7.1.lcssa = phi i64 [ %res_in7.1, %for.inc ] + store i64 %res_in7.1.lcssa, i64* %rinp.050, align 8 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %incdec.ptr = getelementptr inbounds i64, i64* %rinp.050, i64 1 + %3 = load volatile i64, i64* @g_max_i, align 8 + %cmp3 = icmp ugt i64 %3, %indvars.iv.next + br i1 %cmp3, label %for.cond8.preheader, label %for.cond.cleanup5.loopexit + +for.body12: ; preds = %for.inc, %for.cond8.preheader + %j.047 = phi i32 [ 0, %for.cond8.preheader ], [ %inc18, %for.inc ] + %res_in7.046 = phi i64 [ 0, %for.cond8.preheader ], [ %res_in7.1, %for.inc ] + %bit.044 = phi i64 [ %bit_init_val, %for.cond8.preheader ], [ %shl, %for.inc ] + %retval1.243 = phi i32 [ %retval1.149, %for.cond8.preheader ], [ %retval1.3, %for.inc ] + %and = and i64 %bit.044, %in + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body12 +; CHECK-LABEL: if.then: +; CHECK-NEXT: %or = or i64 %res_in7.046, %bit.044 +; CHECK-NEXT: %inc = add nsw i32 %retval1.243, 1 +; CHECK-NEXT: store i8* null, i8** %proc, align 8 + %or = or i64 %res_in7.046, %bit.044 + %inc = add nsw i32 %retval1.243, 1 + store i8* null, i8** %proc, align 8 + br label %if.end + +if.end: ; preds = %for.body12, %if.then + %retval1.3 = phi i32 [ %inc, %if.then ], [ %retval1.243, %for.body12 ] + %res_in7.1 = phi i64 [ %or, %if.then ], [ %res_in7.046, %for.body12 ] + br i1 %tobool14, label %for.inc, label %if.then15 + +if.then15: ; preds = %if.end +; CHECK-LABEL: if.then15: +; CHECK-NEXT: store i8* null, i8** %proc, align 8 + store i8* null, i8** %proc, align 8 + br label %for.inc + +for.inc: ; preds = %if.end, %if.then15 + %inc18 = add nuw nsw i32 %j.047, 1 + %shl = shl i64 %bit.044, 1 + %exitcond = icmp eq i32 %inc18, 64 + br i1 %exitcond, label %for.cond.cleanup11, label %for.body12 +}