diff --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h --- a/llvm/include/llvm/Analysis/CodeMetrics.h +++ b/llvm/include/llvm/Analysis/CodeMetrics.h @@ -15,6 +15,7 @@ #define LLVM_ANALYSIS_CODEMETRICS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/InstructionCost.h" namespace llvm { class AssumptionCache; @@ -47,14 +48,14 @@ /// True if this function calls alloca (in the C sense). bool usesDynamicAlloca = false; - /// Number of instructions in the analyzed blocks. - unsigned NumInsts = false; + /// Code size cost of the analyzed blocks. + InstructionCost NumInsts = 0; /// Number of analyzed blocks. unsigned NumBlocks = false; /// Keeps track of basic block code size estimates. - DenseMap<const BasicBlock *, unsigned> NumBBInsts; + DenseMap<const BasicBlock *, InstructionCost> NumBBInsts; /// Keep track of the number of calls to 'big' functions. unsigned NumCalls = false; diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp --- a/llvm/lib/Analysis/CodeMetrics.cpp +++ b/llvm/lib/Analysis/CodeMetrics.cpp @@ -117,13 +117,6 @@ const BasicBlock *BB, const TargetTransformInfo &TTI, const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO) { ++NumBlocks; - // Use a proxy variable for NumInsts of type InstructionCost, so that it can - // use InstructionCost's arithmetic properties such as saturation when this - // feature is added to InstructionCost. - // When storing the value back to NumInsts, we can assume all costs are Valid - // because the IR should not contain any nodes that cannot be costed. If that - // happens the cost-model is broken. - InstructionCost NumInstsProxy = NumInsts; InstructionCost NumInstsBeforeThisBB = NumInsts; for (const Instruction &I : *BB) { // Skip ephemeral values. 
@@ -183,8 +176,7 @@ if (InvI->cannotDuplicate()) notDuplicatable = true; - NumInstsProxy += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize); - NumInsts = *NumInstsProxy.getValue(); + NumInsts += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize); } if (isa<ReturnInst>(BB->getTerminator())) @@ -204,6 +196,6 @@ notDuplicatable |= isa<IndirectBrInst>(BB->getTerminator()); // Remember NumInsts for this BB. - InstructionCost NumInstsThisBB = NumInstsProxy - NumInstsBeforeThisBB; - NumBBInsts[BB] = *NumInstsThisBB.getValue(); + InstructionCost NumInstsThisBB = NumInsts - NumInstsBeforeThisBB; + NumBBInsts[BB] = NumInstsThisBB; } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -550,9 +550,9 @@ // shouldn't specialize it. Set the specialization cost to Invalid. // Or if the lines of codes implies that this function is easy to get // inlined so that we shouldn't specialize it. 
- if (Metrics.notDuplicatable || + if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() || (!ForceFunctionSpecialization && - Metrics.NumInsts < SmallFunctionThreshold)) { + *Metrics.NumInsts.getValue() < SmallFunctionThreshold)) { InstructionCost C{}; C.setInvalid(); return C; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -828,6 +828,16 @@ }); return false; } + + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " + << "instructions with invalid cost.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst", Switch) + << "Contains instructions with invalid cost."; + }); + return false; + } } unsigned DuplicationCost = 0; @@ -841,7 +851,7 @@ // using binary search, hence the LogBase2(). unsigned CondBranches = APInt(32, Switch->getNumSuccessors()).ceilLogBase2(); - DuplicationCost = Metrics.NumInsts / CondBranches; + DuplicationCost = *Metrics.NumInsts.getValue() / CondBranches; } else { // Compared with jump tables, the DFA optimizer removes an indirect branch // on each loop iteration, thus making branch prediction more precise. The @@ -849,7 +859,7 @@ // predictor to make a mistake, and the more benefit there is in the DFA // optimizer. Thus, the more branch targets there are, the lower is the // cost of the DFA opt. 
- DuplicationCost = Metrics.NumInsts / JumpTableSize; + DuplicationCost = *Metrics.NumInsts.getValue() / JumpTableSize; } LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block " diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -299,7 +299,11 @@ } Metrics.analyzeBasicBlock(BB, *TTI, EphValues); } - unsigned LoopSize = Metrics.NumInsts; + + if (!Metrics.NumInsts.isValid()) + return MadeChange; + + unsigned LoopSize = *Metrics.NumInsts.getValue(); if (!LoopSize) LoopSize = 1; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -674,7 +674,9 @@ NotDuplicatable = Metrics.notDuplicatable; Convergent = Metrics.convergent; - unsigned LoopSize = Metrics.NumInsts; + // FIXME: This will crash for invalid InstructionCost, we should update the + // callers to gracefully bailout in this case. + unsigned LoopSize = *Metrics.NumInsts.getValue(); // Don't allow an estimate of size zero. 
This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -310,7 +310,13 @@ L->dump()); return Rotated; } - if (Metrics.NumInsts > MaxHeaderSize) { + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" + " with invalid cost: "; + L->dump()); + return Rotated; + } + if (*Metrics.NumInsts.getValue() > MaxHeaderSize) { LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " << Metrics.NumInsts << " instructions, which is more than the threshold (" diff --git a/llvm/test/Transforms/LoopRotate/RISCV/invalid-cost.ll b/llvm/test/Transforms/LoopRotate/RISCV/invalid-cost.ll --- a/llvm/test/Transforms/LoopRotate/RISCV/invalid-cost.ll +++ b/llvm/test/Transforms/LoopRotate/RISCV/invalid-cost.ll @@ -76,3 +76,46 @@ ret void } +; This demonstrates a case where a) loop rotate needs a cost estimate to +; know if rotation is profitable, and b) there is no cost estimate available +; due to invalid costs in the loop. We can't rotate this loop. 
+define void @invalid_dup_required(<vscale x 1 x i8>* %p) nounwind ssp { +; CHECK-LABEL: @invalid_dup_required( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-NEXT: [[A:%.*]] = load <vscale x 1 x i8>, <vscale x 1 x i8>* [[P:%.*]], align 1 +; CHECK-NEXT: [[B:%.*]] = add <vscale x 1 x i8> [[A]], [[A]] +; CHECK-NEXT: store <vscale x 1 x i8> [[B]], <vscale x 1 x i8>* [[P]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: call void @f() +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %a = load <vscale x 1 x i8>, <vscale x 1 x i8>* %p + %b = add <vscale x 1 x i8> %a, %a + store <vscale x 1 x i8> %b, <vscale x 1 x i8>* %p + %cmp = icmp slt i32 %i.0, 100 + br i1 %cmp, label %for.body, label %for.end + + +for.body: ; preds = %for.cond + call void @f() + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare void @f()