diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -387,6 +387,7 @@
   bool simplifyCallSite(Function *F, CallBase &Call);
   template <typename Callable>
   bool simplifyInstruction(Instruction &I, Callable Evaluate);
+  bool simplifyIntrinsicCallIsConstant(CallBase &CB);
   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
 
   /// Return true if the given argument to the function being considered for
@@ -1531,6 +1532,27 @@
   return true;
 }
 
+/// Try to simplify a call to llvm.is.constant.
+///
+/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
+/// we expect calls of this specific intrinsic to be infrequent.
+///
+/// FIXME: We could look at CB's parent's caller to determine whether inlining
+/// CB's parent into that caller would change how this call to
+/// llvm.is.constant evaluates; that caller is the function containing the
+/// CallAnalyzer member CandidateCall, which is the call to CB's parent.
+bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
+  Value *Arg = CB.getArgOperand(0);
+  auto *C = dyn_cast<Constant>(Arg);
+
+  if (!C)
+    C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
+
+  Type *RT = CB.getFunctionType()->getReturnType();
+  SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
+  return true;
+}
+
 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
   // Propagate constants through bitcasts.
   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
@@ -2154,6 +2176,8 @@
     if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
       SROAArgValues[II] = SROAArg;
     return true;
+  case Intrinsic::is_constant:
+    return simplifyIntrinsicCallIsConstant(Call);
   }
 }
 
diff --git a/llvm/test/Transforms/Inline/call-intrinsic-is-constant.ll b/llvm/test/Transforms/Inline/call-intrinsic-is-constant.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/Inline/call-intrinsic-is-constant.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -passes=inline -S -inline-threshold=20 | FileCheck %s
+
+; In this test we basically have the following C code:
+
+; long hweight_long_w;
+; int hweight_long (void) {
+;   if (__builtin_constant_p(hweight_long_w))
+;     // Lots of code that would fold away if hweight_long_w were constant,
+;     // but no amount of inlining will make it so.
+;   else
+;     // A little bit of code.
+; }
+; int __nodes_weight (void) { hweight_long(); }
+; int amd_numa_init (void) { __nodes_weight(); }
+
+; The point of this test is that __builtin_constant_p (which is lowered to a
+; call to the intrinsic @llvm.is.constant.i64) does not hinder inlining
+; hweight_long all the way up into amd_numa_init.
+
+@hweight_long_w = external dso_local global i64, align 8
+
+; Testing the InlineCost of the call to @llvm.is.constant.i64.
+; Do not change the linkage of @hweight_long; that will give it a severe
+; discount in cost (LastCallToStaticBonus).
+define i32 @hweight_long() {
+; CHECK-LABEL: @hweight_long(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* @hweight_long_w, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.constant.i64(i64 [[TMP0]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+; CHECK:       cond.true:
+; CHECK-NEXT:    [[AND:%.*]] = and i64 [[TMP0]], 1
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i64 [[AND]], [[TMP0]]
+; CHECK-NEXT:    br label [[COND_END:%.*]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 (i64, ...) bitcast (i32 (...)* @__arch_hweight64 to i32 (i64, ...)*)(i64 [[TMP0]])
+; CHECK-NEXT:    [[CONV286:%.*]] = sext i32 [[CALL]] to i64
+; CHECK-NEXT:    br label [[COND_END]]
+; CHECK:       cond.end:
+; CHECK-NEXT:    [[COND:%.*]] = phi i64 [ [[ADD]], [[COND_TRUE]] ], [ [[CONV286]], [[COND_FALSE]] ]
+; CHECK-NEXT:    [[CONV287:%.*]] = trunc i64 [[COND]] to i32
+; CHECK-NEXT:    ret i32 [[CONV287]]
+;
+entry:
+  %0 = load i64, i64* @hweight_long_w, align 8
+  %1 = call i1 @llvm.is.constant.i64(i64 %0)
+  br i1 %1, label %cond.true, label %cond.false
+
+cond.true:
+  %and = and i64 %0, 1
+  %add = add nsw i64 %and, %0
+  br label %cond.end
+
+cond.false:
+  %call = call i32 (i64, ...) bitcast (i32 (...)* @__arch_hweight64 to i32 (i64, ...)*)(i64 %0)
+  %conv286 = sext i32 %call to i64
+  br label %cond.end
+
+cond.end:
+  %cond = phi i64 [ %add, %cond.true ], [ %conv286, %cond.false ]
+  %conv287 = trunc i64 %cond to i32
+  ret i32 %conv287
+}
+
+; Do not change the linkage of @__nodes_weight; that will give it a severe
+; discount in cost (LastCallToStaticBonus).
+define i32 @__nodes_weight() {
+; CHECK-LABEL: @__nodes_weight(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[__TRANS_TMP_1:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* @hweight_long_w, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.constant.i64(i64 [[TMP0]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[COND_TRUE_I:%.*]], label [[COND_FALSE_I:%.*]]
+; CHECK:       cond.true.i:
+; CHECK-NEXT:    [[AND_I:%.*]] = and i64 [[TMP0]], 1
+; CHECK-NEXT:    [[ADD_I:%.*]] = add nsw i64 [[AND_I]], [[TMP0]]
+; CHECK-NEXT:    br label [[HWEIGHT_LONG_EXIT:%.*]]
+; CHECK:       cond.false.i:
+; CHECK-NEXT:    [[CALL_I:%.*]] = call i32 (i64, ...) bitcast (i32 (...)* @__arch_hweight64 to i32 (i64, ...)*)(i64 [[TMP0]])
+; CHECK-NEXT:    [[CONV286_I:%.*]] = sext i32 [[CALL_I]] to i64
+; CHECK-NEXT:    br label [[HWEIGHT_LONG_EXIT]]
+; CHECK:       hweight_long.exit:
+; CHECK-NEXT:    [[COND_I:%.*]] = phi i64 [ [[ADD_I]], [[COND_TRUE_I]] ], [ [[CONV286_I]], [[COND_FALSE_I]] ]
+; CHECK-NEXT:    [[CONV287_I:%.*]] = trunc i64 [[COND_I]] to i32
+; CHECK-NEXT:    store i32 [[CONV287_I]], i32* [[__TRANS_TMP_1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[__TRANS_TMP_1]], align 4
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %__trans_tmp_1 = alloca i32, align 4
+  %call = call i32 @hweight_long()
+  store i32 %call, i32* %__trans_tmp_1, align 4
+  %0 = load i32, i32* %__trans_tmp_1, align 4
+  ret i32 %0
+}
+
+; The real goal of this test is that @hweight_long gets fully inlined here.
+define dso_local i32 @amd_numa_init() {
+; CHECK-LABEL: @amd_numa_init(
+; CHECK-NEXT:    [[__TRANS_TMP_1_I:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[__TRANS_TMP_1_I]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 4, i8* [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* @hweight_long_w, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.is.constant.i64(i64 [[TMP2]])
+; CHECK-NEXT:    br i1 [[TMP3]], label [[COND_TRUE_I_I:%.*]], label [[COND_FALSE_I_I:%.*]]
+; CHECK:       cond.true.i.i:
+; CHECK-NEXT:    [[AND_I_I:%.*]] = and i64 [[TMP2]], 1
+; CHECK-NEXT:    [[ADD_I_I:%.*]] = add nsw i64 [[AND_I_I]], [[TMP2]]
+; CHECK-NEXT:    br label [[__NODES_WEIGHT_EXIT:%.*]]
+; CHECK:       cond.false.i.i:
+; CHECK-NEXT:    [[CALL_I_I:%.*]] = call i32 (i64, ...) bitcast (i32 (...)* @__arch_hweight64 to i32 (i64, ...)*)(i64 [[TMP2]])
+; CHECK-NEXT:    [[CONV286_I_I:%.*]] = sext i32 [[CALL_I_I]] to i64
+; CHECK-NEXT:    br label [[__NODES_WEIGHT_EXIT]]
+; CHECK:       __nodes_weight.exit:
+; CHECK-NEXT:    [[COND_I_I:%.*]] = phi i64 [ [[ADD_I_I]], [[COND_TRUE_I_I]] ], [ [[CONV286_I_I]], [[COND_FALSE_I_I]] ]
+; CHECK-NEXT:    [[CONV287_I_I:%.*]] = trunc i64 [[COND_I_I]] to i32
+; CHECK-NEXT:    store i32 [[CONV287_I_I]], i32* [[__TRANS_TMP_1_I]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[__TRANS_TMP_1_I]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[__TRANS_TMP_1_I]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 4, i8* [[TMP5]])
+; CHECK-NEXT:    br label [[IF_END7:%.*]]
+; CHECK:       if.end7:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[RETVAL]], align 4
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+  %retval = alloca i32, align 4
+  %call6 = call i32 @__nodes_weight()
+  br label %if.end7
+
+if.end7:
+  %load = load i32, i32* %retval, align 4
+  ret i32 %load
+}
+
+declare i1 @llvm.is.constant.i64(i64)
+declare dso_local i32 @__arch_hweight64(...)
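
For context, here is a minimal, self-contained C sketch of the kernel-style pattern this change targets. It mirrors the hweight_long/__nodes_weight shape in the test above, but the names (popcount_long, nodes_weight, global_w) and the expressions are illustrative only; they are not taken from this patch or from the kernel sources.

/* Sketch: __builtin_constant_p guarding an expensive but constant-foldable
 * path, with a cheap fallback. Compiles with GCC or Clang. */
static unsigned long global_w;

static inline int popcount_long(unsigned long w) {
  if (__builtin_constant_p(w))
    /* A large expression that only folds if w is a compile-time constant.
     * A value loaded from a global never satisfies __builtin_constant_p,
     * so this branch is dead here; before this change the cost analysis
     * still charged for it. */
    return (int)((w & 1) + ((w >> 1) & 1) + ((w >> 2) & 1) + ((w >> 3) & 1));
  /* Cheap fallback, typically an out-of-line helper. */
  return __builtin_popcountl(w);
}

int nodes_weight(void) {
  return popcount_long(global_w);
}

With this patch, the llvm.is.constant call emitted for __builtin_constant_p(w) is simplified to false during inline cost analysis whenever the analyzer cannot prove the operand constant, so the dead expensive branch should no longer count against the threshold when deciding whether to inline popcount_long into nodes_weight.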