This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
include/llvm/
-
llvm/
-
Analysis/
4
TargetTransformInfo.h
-
TargetTransformInfoImpl.h
-
CodeGen/
6
BasicTTIImpl.h
1
SwitchCaseCluster.h
-
lib/
-
Analysis/
5/23
InlineCost.cpp
-
TargetTransformInfo.cpp
-
CodeGen/SelectionDAG/
-
SelectionDAG/
3
SwitchCaseCluster.cpp
-
test/Transforms/Inline/AArch64/
-
Transforms/
-
Inline/
-
AArch64/
-
switch.ll

Differential D31085

[InlineCost] Increase the cost of Switch
ClosedPublic

Authored by junbuml on Mar 17 2017, 10:10 AM.

Download Raw Diff

Details

Reviewers

hans
bmakam
chandlerc
eraman
haicheng
mcrosier

Commits

rG919f9e8d65ad: [InlineCost] Improve the cost heuristic for Switch
rL301649: [InlineCost] Improve the cost heuristic for Switch

Summary

The motivating example is shown below; it has 13 cases but only 2 distinct targets:

lor.lhs.false2:                                   ; preds = %if.then
  switch i32 %Status, label %if.then27 [
    i32 -7012, label %if.end35
    i32 -10008, label %if.end35
    i32 -10016, label %if.end35
    i32 15000, label %if.end35
    i32 14013, label %if.end35
    i32 10114, label %if.end35
    i32 10107, label %if.end35
    i32 10105, label %if.end35
    i32 10013, label %if.end35
    i32 10011, label %if.end35
    i32 7008, label %if.end35
    i32 7007, label %if.end35
    i32 5002, label %if.end35
  ]

which is compiled into a balanced binary tree like this on AArch64 (similar on X86)

.LBB853_9:                              // %lor.lhs.false2
        mov     w8, #10012
        cmp             w19, w8
        b.gt    .LBB853_14
// BB#10:                               // %lor.lhs.false2
        mov     w8, #5001
        cmp             w19, w8
        b.gt    .LBB853_18
// BB#11:                               // %lor.lhs.false2
        mov     w8, #-10016
        cmp             w19, w8
        b.eq    .LBB853_23
// BB#12:                               // %lor.lhs.false2
        mov     w8, #-10008
        cmp             w19, w8
        b.eq    .LBB853_23
// BB#13:                               // %lor.lhs.false2
        mov     w8, #-7012
        cmp             w19, w8
        b.eq    .LBB853_23
        b       .LBB853_3
.LBB853_14:                             // %lor.lhs.false2
        mov     w8, #14012
        cmp             w19, w8
        b.gt    .LBB853_21
// BB#15:                               // %lor.lhs.false2
        mov     w8, #-10105
        add             w8, w19, w8
        cmp             w8, #9          // =9
        b.hi    .LBB853_17
// BB#16:                               // %lor.lhs.false2
        orr     w9, wzr, #0x1
        lsl     w8, w9, w8
        mov     w9, #517
        and             w8, w8, w9
        cbnz    w8, .LBB853_23
.LBB853_17:                             // %lor.lhs.false2
        mov     w8, #10013
        cmp             w19, w8
        b.eq    .LBB853_23
        b       .LBB853_3
.LBB853_18:                             // %lor.lhs.false2
        mov     w8, #-7007
        add             w8, w19, w8
        cmp             w8, #2          // =2
        b.lo    .LBB853_23
// BB#19:                               // %lor.lhs.false2
        mov     w8, #5002
        cmp             w19, w8
        b.eq    .LBB853_23
// BB#20:                               // %lor.lhs.false2
        mov     w8, #10011
        cmp             w19, w8
        b.eq    .LBB853_23
        b       .LBB853_3
.LBB853_21:                             // %lor.lhs.false2
        mov     w8, #14013
        cmp             w19, w8
        b.eq    .LBB853_23
// BB#22:                               // %lor.lhs.false2
        mov     w8, #15000
        cmp             w19, w8
        b.ne    .LBB853_3

However, the inline cost model estimates the cost to be linear with the number of distinct targets and the cost of the above switch is just 2 InstrCosts. The function containing this switch is then inlined about 900 times.

This change modifies the model to be linear with the size of the balanced binary tree.

Diff Detail

Event Timeline

junbuml created this revision.Mar 17 2017, 10:10 AM

Herald added subscribers: rengolin, aemerson. · View Herald TranscriptMar 17 2017, 10:10 AM

junbuml mentioned this in D29870: [InlineCost] Increase the cost of Switch.Mar 17 2017, 10:15 AM

Note that this change was originally written by @haicheng in D29870; I updated his change to use a new TTI hook, based on D31080, to get the number of case clusters.

I can still see the +8% in performance and -7.63% in size in spec2000/vortex. No significant performance and size change in other benchmarks in spec2000/spec2006.

Can you please also try this on top of the changes from D30333? That's changing one switch transformation to happen much later, making it more sensitive to inline cost estimates.

Can you please also try this on top of the changes from D30333? That's changing one switch transformation to happen much later, making it more sensitive to inline cost estimates.

Thanks I will try this !

junbuml added a reviewer: hans.Mar 20 2017, 11:11 AM

While I'm working on the accurate version of the case cluster calculation through the TTI hook in this change, based on D31080, I also want to see if it makes sense to apply different levels of approximation depending on the number of cases. For example, if the number of cases is more than some threshold, we can use a very rough approximation (e.g., use the number of cases as the number of clusters, as in D29870). Otherwise, we can use a more accurate cluster calculation based on D31080.

junbuml mentioned this in D31080: [DAG] Extract switch lowering as a spearate object NFC.Mar 29 2017, 10:51 AM

The below comment by Hans is copied from D31080 :

Before we get any further, I also would like to ask if you have done any measurements of compile-time with this set of patches. As I said before, I think this could be quite an expensive hook to call for the inline cost analysis, and it would be nice to see some numbers. If it turns out that it is expensive, perhaps we could come up with some better inline cost heuristic, perhaps something based on the density of the switch.

I think there are three different cost heuristics:

The cheapest yet inaccurate version is D29870 in which the number of case is simply used as number of cluster.
The most accurate yet expensive version is to use a hook from D31080.
I guess what Hans suggested in above comment (something based on the density of the switch) must be somewhere in between #1 and #2.

Both #1 and #2 consider forming a BTree, and don't require to update the cost heuristic for the changes in switch lowering. I'm not clear about #3 in terms of the accuracy, compile-time, and maintenance. Hans, can you give me little bit more detail?

For me, at least #1 is still better than the current heuristic, which simply counts the number of distinct successor blocks. If so, would it make sense to use #1 by default and add different levels of cost heuristics behind flags, so that we can come up with the most reasonable heuristic and allow others to do experiments?

In D31085#713391, @junbuml wrote:

The below comment by Hans is copied from D31080 :

Before we get any further, I also would like to ask if you have done any measurements of compile-time with this set of patches. As I said before, I think this could be quite an expensive hook to call for the inline cost analysis, and it would be nice to see some numbers. If it turns out that it is expensive, perhaps we could come up with some better inline cost heuristic, perhaps something based on the density of the switch.

I think there are three different cost heuristics:

The cheapest yet inaccurate version is D29870 in which the number of case is simply used as number of cluster.

The most accurate yet expensive version is to use a hook from D31080.

I guess what Hans suggested in above comment (something based on the density of the switch) must be somewhere in between #1 and #2.

Both #1 and #2 consider forming a BTree, and don't require to update the cost heuristic for the changes in switch lowering. I'm not clear about #3 in terms of the accuracy, compile-time, and maintenance. Hans, can you give me little bit more detail?

For me, at least #1 is still better than the current heuristic, which simply counts the number of distinct successor blocks. If so, would it make sense to use #1 by default and add different levels of cost heuristics behind flags, so that we can come up with the most reasonable heuristic and allow others to do experiments?

Yes, counting the number of successor blocks doesn't seem right.

I think a decent heuristic might be:

Ask TTI about the native word width, and check if this switch is trivially lowered with a bit test (two successors, case range fits in a machine word); this is very cheap
Ask TTI about jump table density conditions; if the whole switch is dense enough, assume it's a jump table and compute a cost based on that
Otherwise assume it's a balanced tree and estimate the cost based on number of cases (it would be nice not to actually have to build the tree, but just compute an estimate cost using some formula)

This means the estimate will be off for switches that are lowered with a mix of jump tables, binary trees and bit-tests, but those aren't that common, and not being completely accurate is probably fine.

I think going with only #1 might not be so good because we'll overestimate the cost of many switches. But I think computing 1-3 would make for a good a cheap heuristic.

If we are not worried about the mix of cases, I think this is reasonably cheap to use.

Chandler, do you agree with the heuristic Hans suggested above? Even though it does not cover switches that are lowered with a mix of jump table/bit test/BTree, I think this is a reasonable compromise between accuracy and the cost of the hook.

Based on Hans' suggestion, checked if switch is suitable for either bit test or jump table. If not suitable for both, use BTree. Please take a look and let me know any comment.

haicheng added inline comments.Apr 10 2017, 9:13 AM

include/llvm/CodeGen/SwitchCaseCluster.h
123	Return
lib/Analysis/InlineCost.cpp
1011–1031	I think we can exit early if the number of cases is too large.
1037–1078	If the estimation chooses to use jumptable, I think we also need to add the cost of the table which is proportional to the range.
lib/CodeGen/SelectionDAG/SwitchCaseCluster.cpp
86	We can start from begin()+1
test/Transforms/Inline/switch.ll
3 ↗	(On Diff #94559)	We may need to add tests for jump table and bit test.

junbuml added inline comments.Apr 10 2017, 12:37 PM

lib/Analysis/InlineCost.cpp
1011–1031	In SwitchCaseClusterFinder::getEstimatedNumberOfCluster(), we have early exit for a large number of cases. But I guess you mean something else. Can you specify little bit more about the "too large".
1037–1078	I'm not sure if we really need to consider the size of table as a cost. I think just couple of instructions to look up the table and jump to actual blocks need to be considered as cost.
lib/CodeGen/SelectionDAG/SwitchCaseCluster.cpp
86	Thanks. I will do that.
test/Transforms/Inline/switch.ll
3 ↗	(On Diff #94559)	Yes. I will do that.

junbuml added inline comments.Apr 10 2017, 1:48 PM

lib/Analysis/InlineCost.cpp
1037–1078	You are right Haicheng. Looks like we need to consider the cost of the table as well.

haicheng added inline comments.Apr 10 2017, 2:18 PM

lib/Analysis/InlineCost.cpp
1011–1031	One case needs at least one instruction. So if cost + numcases * instrcost > threshold, we can exit early.

In D31085#715294, @junbuml wrote:

Chandler, do you agree with the heuristic Hans suggested above? Even though it does not cover switches that are lowered with a mix of jump table/bit test/BTree, I think this is a reasonable compromise between accuracy and the cost of the hook.

Yes, I like this model.

lib/CodeGen/SelectionDAG/SwitchCaseCluster.cpp
68–69	inlining and loop unrolling. it's a generic cost model.

This really looks like it is going in the right direction. I'm going to work on reviewing some of the code changes a bit more closely, but I wanted to mention one other thing.

This seems like a really good change to the inlining cost model, but it also seems likely to be a pretty big change. I think it is important to collect some benchmark data to make sure we're not going to uncover a significant regression by surprise. At the very least, I think running the LLVM test suite would be a good start and identifying:

How many benchmarks change
For the ones that change, what is the codesize impact
For the ones that change, what is the runtime impact

For #2 and #3 you probably want at least '-O2', but maybe also '-O3' and '-Os'.

It may be useful to ask others to benchmark other applications and/or various architectures as well. To facilitate that, I might suggest putting the code for this in under a flag that is off and then soliciting benchmark data on llvm-dev with the flag, and based on that data, enable the flag. But if this doesn't fire too often in the test suite, or the results are particularly good, might be easy to just try it and see.

Thanks again for working on this!

Addressed Haicheng's comments and added a flag as Chandler asked. With this update, I kicked off performance tests for the llvm test suite, spec2000, and spec2006 on aarch64, but I will only be able to share organized data early next week, as I will be out of the office for the rest of this week. Please let me know if you have any comments.

I was hoping we wouldn't need the refactoring to SwitchCaseCluster.cpp, and that InlineCost.cpp could just work with TLI to check whether jump tables are allowed, density requirements etc.

My concern is that separating the case cluster code from SelectionDAGBuilder might be more trouble than it's worth.

I was hoping we wouldn't need the refactoring to SwitchCaseCluster.cpp, and that InlineCost.cpp could just work with TLI to check whether jump tables are allowed, density requirements etc.

Sure, I can make InlineCost work with just TLI, but I don't want to duplicate the same code for InlineCost from lowering. So I will refactor just a little bit on SelectionDAGBuilder to expose some util functions.

My concern is that separating the case cluster code from SelectionDAGBuilder might be more trouble than it's worth.

Can you give me little bit more details about your concern ?

In D31085#728192, @junbuml wrote:

I was hoping we wouldn't need the refactoring to SwitchCaseCluster.cpp, and that InlineCost.cpp could just work with TLI to check whether jump tables are allowed, density requirements etc.

Sure, I can make InlineCost work with just TLI, but I don't want to duplicate the same code for InlineCost from lowering. So I will refactor just a little bit on SelectionDAGBuilder to expose some util functions.

That sounds good to me.

My concern is that separating the case cluster code from SelectionDAGBuilder might be more trouble than it's worth.

Can you give me little bit more details about your concern ?

The concern is just that the switch lowering code is fairly tightly integrated with SelectionDAGBuilder, and pulling it out isn't worth the effort if all we need from TLI is some basic info, like if the whole switch is dense enough for a lookup table.

Addressed Hans' comments; now the switch cost heuristic just work with TLI.

Please see the list of benchmarks that changed in the LLVM test suite and spec2000/2006 below. No significant code size regression was found in any config. Overall there was a minor positive impact on code size in the LLVM test suite, and in O3 with LTO there was a 7.9% reduction in code size in spec2000/vortex.

On AArch64, I didn't see any clear performance impact in the LLVM test suite, but in O3 with LTO, I observed a +17.82% performance improvement in spec2000/vortex.

O2 :

Benchmarks	Code size (- is better)
MultiSource/Applications/siod	-0.061%
MultiSource/Applications/hbd	0.000%
MultiSource/Applications/JM/lencod/lencod	-0.070%
MultiSource/Applications/JM/ldecod/ldecod	0.000%
MultiSource/Applications/lua/lua	0.001%
MultiSource/Applications/d/make_dparser	-0.081%
MultiSource/Applications/sqlite3/sqlite3	-0.217%
MultiSource/Benchmarks/Prolangs-C/bison/mybison	0.000%
MultiSource/Benchmarks/MiBench/consumer-typeset/consumer-typeset	0.000%
MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg	-0.082%
MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/cjpeg	0.000%
MultiSource/Benchmarks/MallocBench/gs/gs	0.000%
spec2000/perlbmk	-0.066%
spec2000/gcc	-0.183%
spec2000/parser	0.000%
spec2000/crafty	-0.080%
spec2000/mesa	-0.286%
spec2006/soplex	0.000%
spec2006/xalancbmk	0.000%
spec2006/hmmer	-0.072%
spec2006/gcc	-0.191%
spec2006/h264ref	0.000%
spec2006/povray	-0.057%
spec2006/perlbench	-0.060%

Os :

Benchmarks	Code size(- is better)
MultiSource/Applications/siod/siod	-0.123%
MultiSource/Applications/JM/lencod/lencod	0.000%
MultiSource/Applications/lua/lua	0.000%
MultiSource/Applications/sqlite3/sqlite3	0.001%
MultiSource/Benchmarks/mafft/pairlocalalign	0.000%
MultiSource/Benchmarks/7zip/7zip-benchmark	0.001%
spec2000/perlbmk	0.000%
spec2000/gap	0.000%
spec2000/gcc	0.000%
spec2000/mesa	-0.071%
spec2000/vortex	0.000%
spec2006/gobmk	0.000%
spec2006/xalancbmk	0.000%
spec2006/hmmer	0.000%
spec2006/gcc	-0.048%
spec2006/omnetpp	0.000%
spec2006/h264ref	0.000%
spec2006/perlbench	-0.062%

O3 :

Benchmarks	Code size(- is better)
MultiSource/Applications/kimwitu++/kc	0.000%
MultiSource/Applications/siod/siod	-0.061%
MultiSource/Applications/hbd/hbd	0.000%
MultiSource/Applications/JM/lencod/lencod	0.000%
MultiSource/Applications/lua/lua	0.001%
MultiSource/Applications/SIBsim4/SIBsim4	0.001%
MultiSource/Applications/d/make_dparser	0.000%
MultiSource/Applications/sqlite3/sqlite3	-0.212%
MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/cjpeg	0.000%
MultiSource/Benchmarks/MallocBench/gs/gs	0.000%
MultiSource/Benchmarks/7zip/7zip-benchmark	0.000%
spec2000/perlbmk	-0.066%
spec2000/gcc	-0.182%
spec2000/mesa	-0.356%
spec2000/vortex	0.000%
spec2006/soplex	0.000%
spec2006/xalancbmk	-0.019%
spec2006/bzip2	0.000%
spec2006/hmmer	0.000%
spec2006/gcc	-0.188%
spec2006/h264ref	0.000%
spec2006/povray	-0.056%
spec2006/perlbench	-0.121%

Spec2000/2006 performance in O3 :

Benchmarks	Score(+ is better)
spec2000/perlbmk	+2.706%
spec2000/vortex	+2.028%
spec2000/mesa	-1.76%
spec2006/soplex	-1.334%
spec2006/povray	+1.509%
spec2006/perlbench	+0.757%

Spec2000/2006 performance in O3 with LTO :

Benchmarks	Score(+ is better)
spec2000/gzip	+2.180%
spec2000/mesa	-4.093%
spec2000/vortex	+17.822%

It's starting to look much simpler, which is great.

include/llvm/Analysis/TargetTransformInfo.h
764	The other function here return an estimated cost for lowering. I think that would be a better interface for this too.
include/llvm/CodeGen/BasicTTIImpl.h
176	I wish this could be much simpler. Maybe most of the code could defer to TLI::isSuitableForBitTest / isSuitableForJumpTable which could also be used from the DAG code.
226	If we have TLI::isSuitableForBitTests, maybe we should have isSuitableForJumpTable too, that way we don't have to duplicate as much logic, and that one could do what areJTsAllowed() as well.
lib/Analysis/InlineCost.cpp
1055	If the whole switch is suitable for a jump table, can't we just return after this?
1057	Why do we have to do this? The code is basically constructing the tree and throwing it away. It should be possible to compute an estimate for the size of the tree with a closed-form mathematical expression.

haicheng added inline comments.Apr 20 2017, 6:37 AM

lib/Analysis/InlineCost.cpp
1057	If n is the case number, f(n) is the mapping from the case number to the node number of BTree. The recursion is f(n) = 1 + f(n/2) + f (n - n/2), when n > 3. So, f(n) is between n + 2^(log2(n) - 1) - 1 and n + 2^(log2(n)) - 1. The lower bound is about 1.5n - 1 and the upper bound is about 2n - 1

haicheng added inline comments.Apr 20 2017, 8:48 AM

lib/Analysis/InlineCost.cpp
1057	The exact equation, where log(n) denotes floor(log2(n)), is: f(n) = n, for n <= 3; f(n) = n + 2^(log(n) - 1) - 1, for n > 3 && 2^(log(n)) <= n <= 1.5 * 2^(log(n)); f(n) = 2n - 2^(log(n)) - 1, for n > 3 && 1.5 * 2^(log(n)) < n < 2^(log(n) + 1).

junbuml updated this revision to Diff 96252.Apr 21 2017, 3:31 PM

junbuml added inline comments.

include/llvm/Analysis/TargetTransformInfo.h
764	I doubt if this is good place to get the inline cost because other functions handle user costs which is different from inline cost. Mixing the user cost and inline cost here might be a bad choice. I believe the inline cost should be decided in InlineCost.
include/llvm/CodeGen/BasicTTIImpl.h
176	Tried to make it simpler, but we still need to find Min/MaxValue here without forming CaseClusters and it's also good to check IsJTAllowed early before doing the actual suitability check to avoid iterating the for loop to find Min/MaxValue in case not allowed. Please take a look and let me know if there is any part you want to move in either isSuitableForBitTest or isSuitableForJumpTable.
226	Added isSuitableForJumpTable in TLI which used in here and DAG, but keep areJTsAllowed() as a separate function because areJTsAllowed() need to be checked only once in findJumpTable() in DAG, and we can also hit the early exit in this function when JT is not allowed.
lib/Analysis/InlineCost.cpp
1057	Thanks Haicheng for this. If n is a power of 2, the number of nodes should be n + 2^(log2(n) - 1) - 1. For non-power-of-2 cases, the lower bound is n + 2^(floor(log2(n)) - 1) - 1 and the upper bound is n + 2^(ceiling(log2(n)) - 1) - 1. As an estimate, I think using the upper bound is simple and conservative enough.

hans added inline comments.Apr 21 2017, 4:35 PM

include/llvm/Analysis/TargetTransformInfo.h
764	I was thinking it doesn't need to be used as the inlining cost directly, but exposing some metric the inline cost model could then make use of - that's kind of what we're doing anyway. What I don't like about this one is that it feels a little out of place compared to other functions which return an estimate of the cost of lowering an instruction, whereas this seems to return details about the lowering instead. I'm not sure how to make this better though.
include/llvm/Target/TargetLowering.h
766 ↗	(On Diff #96252)	NumCluster -> NumClusters please
lib/Analysis/InlineCost.cpp
1061	I'm not sure it's a BTree, but rather a regular binary search tree.
1074	Very nice! I think the code can be simplified though, and this will allow us to avoid the rounding from Log2_64_Ceil: (N = NumCaseCluster) `NumberOfNonLeaf = 2^(log2(N) - 1) - 1 = 2^log2(N) * 2^-1 - 1 = N/2 - 1` Adding NumCaseCluster to that yields `N + N / 2 - 1 = N * 3 / 2 - 1`. I kind of wish this is all there was to this change and we didn't need all that other code: quick check if the switch is suitable for a bit test or jump table, otherwise compute a cost based on `N * 3 / 2 - 1`. I don't know if it's possible though.
1079	Can we really run into INT_MAX here? Especially given the Threshold check above?
1081	I would suggest handling the NumCaseCluster <= 3 first and returning early for that, since it's the less complicated case.

Addressed Hans' comments.

include/llvm/Analysis/TargetTransformInfo.h
764	I see what you meant and I can see functions which return the lowering cost. However, this file also hold functions which expose details about the machine directly to be used in IR-level. For me it seems that exposing this information from TLI to InlineCost still keep the original intention of this class. I will be happy to hear any better suggestion about the way of exposing this lowering information.
lib/Analysis/InlineCost.cpp
1074	Thanks Hans for this. Updated it based your comments. I think it's difficult to use a closed form which cover all lowering cases, especially in different targets. I believe we need to differentiate each lowering case as the costs varying depending on the lowering cases and targets.
1079	In very extreme case, it could happen. For a very large Threshold (e.g., INT_MAX), even though SI.getNumCases() * InlineConstants::InstrCost is smaller than the Threshold. ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost could hit INT_MAX.

Apologies for the reviews dragging out.

I think we've got the right idea with computing inline cost based on an estimate of the number of nodes in binary search tree, but the patch is still making a lot of changes all over the place making it hard to review.

The smaller you can make this change, the easier it will be to get this committed.

include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	This check should probably be done in `isSuitableForJumpTable`. There should be no need to pass NumClusters to `areJTsAllowed` since whether jump tables are allowed only depends on the target and function. The switch instruction also shouldn't be passed in, just the function.
779 ↗	(On Diff #96464)	This is used in isSuitableForBitTests, but does it really need to be exposed in the TargetLowering interface?
792 ↗	(On Diff #96464)	What's the difference between Range and JumpTableSize? As far as I can tell, those are always the same.
lib/Analysis/InlineCost.cpp
1083	Aren't you adding `Cost` twice here? You're doing `Cost +=` and also `SwitchCost + Cost`. Oh, Cost is just a regular int; yeah then I see how it can overflow. But std::min is returning a uint64_t here, so it seems you're still not handling the overflow?

hans added inline comments.Apr 25 2017, 11:00 AM

lib/Analysis/InlineCost.cpp
1083	Oh never mind, you're capping the uint64_t at INT_MAX so it should work. Probably still don't want to do `Cost +=` though.

I think we've got the right idea with computing inline cost based on an estimate of the number of nodes in binary search tree, but the patch is still making a lot of changes all over the place making it hard to review.
The smaller you can make this change, the easier it will be to get this committed.

For me, it seems that the code change in DAG and TLI only make sense when reviewed together with the changes in InlineCost. That's why I put them together. If you generally agree with the change in DAG side code, I can break it as a separate patch, and leave only inliner side change in here.

include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	Yes, I agree that areJTsAllowed() should see only function and target, so I just pass a Function to areJTsAllowed(). But I don't think isSuitableForJumpTable is good place for this check because this check is for the whole clusters of a switch, but isSuitableForJumpTable should see a set of clusters, not necessary the whole clusters, especially when we try to build jump tables for split clusters in DAG. We also need to do an early exit when this check hit. So I do this check in findJumpTables() in DAG and getEstimatedNumberOfCaseClusters() in TTI.
779 ↗	(On Diff #96464)	This is also used in findBitTestClusters and buildBitTests(), and for me doing this check through TLI doesn't seem to be weird. Please let me know if you see any better place for this function.
792 ↗	(On Diff #96464)	Yes, these are the same.
lib/Analysis/InlineCost.cpp
1083	Thanks for this. Yes, it should be Cost =, instead of Cost +=.

In D31085#737231, @junbuml wrote:

I think we've got the right idea with computing inline cost based on an estimate of the number of nodes in binary search tree, but the patch is still making a lot of changes all over the place making it hard to review.
The smaller you can make this change, the easier it will be to get this committed.

For me, it seems that the code change in DAG and TLI only make sense when reviewed together with the changes in InlineCost. That's why I put them together. If you generally agree with the change in DAG side code, I can break it as a separate patch, and leave only inliner side change in here.

Yes, reviewing them together makes total sense. I just wish the change we're making were smaller.

Anyway, I think this is really close now. Each time I read through it, it looks better :-)

include/llvm/CodeGen/BasicTTIImpl.h
195	Declaring CI and starting to increment it outside the for-loop is a little unusual. I realize this is to avoid repeating the first iteration (maybe I wrote this somewhere?), but I think it would be better if this were written as a straight-forward loop: APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); APInt MinCaseVal = SI.case_begin()->getCaseValue()->getValue(); for (auto CI : SI.cases()) { ... }
include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	isSuitableForJumpTable() seems like a good place for this to me. It should be able to handle the whole switch range or a subset in the same way.
lib/Analysis/InlineCost.cpp
1044	Could this overflow?
1083	Cool, makes sense now. Are the `(uint64_t)` casts strictly necessary though? SwitchCost is already `uint64_t` so I'd imagine `INT_MAX` to get promoted and everything to work out?

junbuml added inline comments.Apr 26 2017, 10:34 AM

include/llvm/CodeGen/BasicTTIImpl.h
195	Yes, I like this. I will do that.
include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	If we use range for this check in isSuitableForJumpTable(), it will change the behavior in lowering. Current lowering code use the total number of cluster (N) for this check only once at the beginning of findJumpTables(), which is different from both Range and NumCases. And also when partitioning clusters for jump tables, doing this check in isSuitableForJumpTable() may change the behavior of the loop finding partitions. If either Range or NumCases should be used for this check, I think it should be a separate patch.

hans added inline comments.Apr 26 2017, 11:15 AM

include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	Oh, I see what you mean. It's annoying that we'll end up with a `isSuitableForJumpTable()` that doesn't take the minimum size into account though :-/

junbuml updated this revision to Diff 96799.Apr 26 2017, 11:55 AM

junbuml marked 3 inline comments as done.

junbuml added inline comments.

lib/Analysis/InlineCost.cpp
1083	We need (uint64_t)INT_MAX, but we don't need (uint64_t) Cost as SwitchCost is uint64_t.

hans added inline comments.Apr 26 2017, 2:04 PM

include/llvm/Target/TargetLowering.h
787 ↗	(On Diff #96799)	It still seems backward that this is checking density and maximum table size, but that the minimum size is expected to be checked elsewhere :-/ There must be some better way to factor this.

junbuml added inline comments.Apr 26 2017, 2:30 PM

include/llvm/Target/TargetLowering.h
787 ↗	(On Diff #96799)	I agree that doing minSize check in isSuitableForJumpTable looks clear. However, I want to keep the changes in lowering side in this patch is NFC. That's why we cannot do this check in isSuitableForJumpTable(). If you want to add MinSize check in isSuitableForJumpTable(), I think we should also pass the cluster size (N) to isSuitableForJumpTable() in findJumpTable() because Range and NumCases could be different from the cluster size(N). Only way I can think of doing this check in isSuitableForJumpTable() is : pass the cluster size (N) to isSuitableForJumpTable() for the first call of isSuitableForJumpTable() in findJumpTable() because this is for the whole range. Since the second calls of isSuitableForJumpTable() in findJumpTable for partitionized clusters should not check MinSize with sub clusters, we do not use isSuitableForJumpTable(), instead checking the max size and density separately just like current code. However, I think this way is even worse than my current code. Please let me know your thought.

lgtm

include/llvm/Target/TargetLowering.h
787 ↗	(On Diff #96799)	Let's put a FIXME in isSuitableForJumpTable that it would be nice if the min size check could somehow be combined with the other checks here.

This revision is now accepted and ready to land.Apr 26 2017, 2:48 PM

Added FIXME as Hans suggested.
I will commit it if there is no further comment from other reviewers by tomorrow. I will also post a follow-up patch to enable the flag (-inline-generic-switch-cost).

Thank you very much for the review.

Closed by commit rL301649: [InlineCost] Improve the cost heuristic for Switch (authored by junbuml). · Explain WhyApr 28 2017, 9:17 AM

This revision was automatically updated to reflect the committed changes.

tejohnson mentioned this in D67716: [Inliner] Remove incorrect early exit during switch cost computation.Sep 18 2019, 9:25 AM

tejohnson mentioned this in rL372440: [Inliner] Remove incorrect early exit during switch cost computation.Sep 20 2019, 4:28 PM

tejohnson mentioned this in rG2f32e5d84d34: [Inliner] Remove incorrect early exit during switch cost computation.

Revision Contents

Path

Size

include/

llvm/

Analysis/

TargetTransformInfo.h

12 lines

TargetTransformInfoImpl.h

5 lines

CodeGen/

BasicTTIImpl.h

11 lines

SwitchCaseCluster.h

5 lines

lib/

Analysis/

InlineCost.cpp

79 lines

TargetTransformInfo.cpp

5 lines

CodeGen/

SelectionDAG/

SwitchCaseCluster.cpp

58 lines

test/

Transforms/

Inline/

AArch64/

switch.ll

123 lines

Diff 94880

include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 191 Lines • ▼ Show 20 Lines	int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys) const;		ArrayRef<Type *> ParamTys) const;

/// \brief Estimate the cost of an intrinsic when lowered.		/// \brief Estimate the cost of an intrinsic when lowered.
///		///
/// Mirrors the \c getCallCost method but uses an intrinsic identifier.		/// Mirrors the \c getCallCost method but uses an intrinsic identifier.
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,		int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments) const;		ArrayRef<const Value *> Arguments) const;

		/// \return The estimated number of case clusters for the 'SI' when lowered.
		/// \p JTSize Set a jump table size only when /p SI is suitable for a jump
		/// table.
		int getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		unsigned *JTSize = nullptr) const;

/// \brief Estimate the cost of a given IR user when lowered.		/// \brief Estimate the cost of a given IR user when lowered.
///		///
/// This can estimate the cost of either a ConstantExpr or Instruction when		/// This can estimate the cost of either a ConstantExpr or Instruction when
/// lowered. It has two primary advantages over the \c getOperationCost and		/// lowered. It has two primary advantages over the \c getOperationCost and
/// \c getGEPCost above, and one significant disadvantage: it can only be		/// \c getGEPCost above, and one significant disadvantage: it can only be
/// used when the IR construct has already been formed.		/// used when the IR construct has already been formed.
///		///
/// The advantages are that it can inspect the SSA use graph to reason more		/// The advantages are that it can inspect the SSA use graph to reason more
▲ Show 20 Lines • Show All 542 Lines • ▼ Show 20 Lines	public:
virtual int getCallCost(const Function *F, int NumArgs) = 0;		virtual int getCallCost(const Function *F, int NumArgs) = 0;
virtual int getCallCost(const Function *F,		virtual int getCallCost(const Function *F,
ArrayRef<const Value *> Arguments) = 0;		ArrayRef<const Value *> Arguments) = 0;
virtual unsigned getInliningThresholdMultiplier() = 0;		virtual unsigned getInliningThresholdMultiplier() = 0;
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,		virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys) = 0;		ArrayRef<Type *> ParamTys) = 0;
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,		virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments) = 0;		ArrayRef<const Value *> Arguments) = 0;
		virtual int getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		hansUnsubmitted Not Done Reply Inline Actions The other functions here return an estimated cost for lowering. I think that would be a better interface for this too. hans: The other functions here return an estimated cost for lowering. I think that would be a better…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions I doubt if this is good place to get the inline cost because other functions handle user costs which is different from inline cost. Mixing the user cost and inline cost here might be a bad choice. I believe the inline cost should be decided in InlineCost. junbuml: I doubt if this is good place to get the inline cost because other functions handle user costs…
		hansUnsubmitted Not Done Reply Inline Actions I was thinking it doesn't need to be used as the inlining cost directly, but exposing some metric the inline cost model could then make use of - that's kind of what we're doing anyway. What I don't like about this one is that it feels a little out of place compared to other functions which return an estimate of the cost of lowering an instruction, whereas this seems to return details about the lowering instead. I'm not sure how to make this better though. hans: I was thinking it doesn't need to be used as the inlining cost directly, but exposing some…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions I see what you meant and I can see functions which return the lowering cost. However, this file also hold functions which expose details about the machine directly to be used in IR-level. For me it seems that exposing this information from TLI to InlineCost still keep the original intention of this class. I will be happy to hear any better suggestion about the way of exposing this lowering information. junbuml: I see what you meant and I can see functions which return the lowering cost. However, this file…
		unsigned *JTSize = nullptr) = 0;
virtual int getUserCost(const User *U) = 0;		virtual int getUserCost(const User *U) = 0;
virtual bool hasBranchDivergence() = 0;		virtual bool hasBranchDivergence() = 0;
virtual bool isSourceOfDivergence(const Value *V) = 0;		virtual bool isSourceOfDivergence(const Value *V) = 0;
virtual unsigned getFlatAddressSpace() = 0;		virtual unsigned getFlatAddressSpace() = 0;
virtual bool isLoweredToCall(const Function *F) = 0;		virtual bool isLoweredToCall(const Function *F) = 0;
virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;		virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;
virtual bool isLegalAddImmediate(int64_t Imm) = 0;		virtual bool isLegalAddImmediate(int64_t Imm) = 0;
virtual bool isLegalICmpImmediate(int64_t Imm) = 0;		virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
▲ Show 20 Lines • Show All 281 Lines • ▼ Show 20 Lines	unsigned getMinPrefetchStride() override {
return Impl.getMinPrefetchStride();		return Impl.getMinPrefetchStride();
}		}
unsigned getMaxPrefetchIterationsAhead() override {		unsigned getMaxPrefetchIterationsAhead() override {
return Impl.getMaxPrefetchIterationsAhead();		return Impl.getMaxPrefetchIterationsAhead();
}		}
unsigned getMaxInterleaveFactor(unsigned VF) override {		unsigned getMaxInterleaveFactor(unsigned VF) override {
return Impl.getMaxInterleaveFactor(VF);		return Impl.getMaxInterleaveFactor(VF);
}		}
		int getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		unsigned *JTSize = nullptr) override {
		return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize);
		}
unsigned		unsigned
getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,		getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
OperandValueKind Opd2Info,		OperandValueKind Opd2Info,
OperandValueProperties Opd1PropInfo,		OperandValueProperties Opd1PropInfo,
OperandValueProperties Opd2PropInfo,		OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args) override {		ArrayRef<const Value *> Args) override {
return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,		return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo, Args);		Opd1PropInfo, Opd2PropInfo, Args);
▲ Show 20 Lines • Show All 213 Lines • Show Last 20 Lines

include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 108 Lines • ▼ Show 20 Lines	int getGEPCost(Type PointeeType, const Value Ptr,
// into their uses via addressing modes.		// into their uses via addressing modes.
for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx)		for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx)
if (!isa<Constant>(Operands[Idx]))		if (!isa<Constant>(Operands[Idx]))
return TTI::TCC_Basic;		return TTI::TCC_Basic;

return TTI::TCC_Free;		return TTI::TCC_Free;
}		}

		int getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		unsigned *JTSize = nullptr) {
		return SI.getNumCases();
		}

unsigned getCallCost(FunctionType *FTy, int NumArgs) {		unsigned getCallCost(FunctionType *FTy, int NumArgs) {
assert(FTy && "FunctionType must be provided to this routine.");		assert(FTy && "FunctionType must be provided to this routine.");

// The target-independent implementation just measures the size of the		// The target-independent implementation just measures the size of the
// function by approximating that each argument will take on average one		// function by approximating that each argument will take on average one
// instruction to prepare.		// instruction to prepare.

if (NumArgs < 0)		if (NumArgs < 0)
▲ Show 20 Lines • Show All 557 Lines • Show Last 20 Lines

include/llvm/CodeGen/BasicTTIImpl.h

Show All 11 Lines
/// interfaces.		/// interfaces.
///		///
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H		#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H		#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/Analysis/LoopInfo.h"		#include "llvm/Analysis/LoopInfo.h"
		#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"		#include "llvm/Analysis/TargetTransformInfoImpl.h"
		#include "llvm/CodeGen/SwitchCaseCluster.h"
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLowering.h"		#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"		#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"

namespace llvm {		namespace llvm {

extern cl::opt<unsigned> PartialUnrollingThreshold;		extern cl::opt<unsigned> PartialUnrollingThreshold;

/// \brief Base class which can be used to help build a TTI implementation.		/// \brief Base class which can be used to help build a TTI implementation.
///		///
/// This class provides as much implementation of the TTI interface as is		/// This class provides as much implementation of the TTI interface as is
▲ Show 20 Lines • Show All 133 Lines • ▼ Show 20 Lines	if (IID == Intrinsic::ctlz) {
if (getTLI()->isCheapToSpeculateCtlz())		if (getTLI()->isCheapToSpeculateCtlz())
return TargetTransformInfo::TCC_Basic;		return TargetTransformInfo::TCC_Basic;
return TargetTransformInfo::TCC_Expensive;		return TargetTransformInfo::TCC_Expensive;
}		}

return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);		return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
}		}

		int getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		unsigned *JumpTableSize) {
		hansUnsubmitted Not Done Reply Inline Actions I wish this could be much simpler. Maybe most of the code could defer to TLI::isSuitableForBitTest / isSuitableForJumpTable which could also be used from the DAG code. hans: I wish this could be much simpler. Maybe most of the code could defer to TLI…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions Tried to make it simpler, but we still need to find Min/MaxValue here without forming CaseClusters and it's also good to check IsJTAllowed early before doing the actual suitability check to avoid iterating the for loop to find Min/MaxValue in case not allowed. Please take a look and let me know if there is any part you want to move in either isSuitableForBitTest or isSuitableForJumpTable. junbuml: Tried to make it simpler, but we still need to find Min/MaxValue here without forming…
		SwitchCaseClusterFinder CaseClusters(
		this->getDataLayout(), *getST()->getTargetLowering(),
		getTLI()->getTargetMachine().getOptLevel());
		return CaseClusters.getEstimatedNumberOfClusters(SI, *JumpTableSize);
		}

unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }		unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }

unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }		unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }

bool shouldBuildLookupTables() {		bool shouldBuildLookupTables() {
const TargetLoweringBase *TLI = getTLI();		const TargetLoweringBase *TLI = getTLI();
return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) \|\|		return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) \|\|
TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);		TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
}		}

bool haveFastSqrt(Type *Ty) {		bool haveFastSqrt(Type *Ty) {
const TargetLoweringBase *TLI = getTLI();		const TargetLoweringBase *TLI = getTLI();
EVT VT = TLI->getValueType(DL, Ty);		EVT VT = TLI->getValueType(DL, Ty);
		hansUnsubmitted Not Done Reply Inline Actions Declaring CI and starting to increment it outside the for-loop is a little unusual. I realize this is to avoid repeating the first iteration (maybe I wrote this somewhere?), but I think it would be better if this were written as a straight-forward loop: APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); APInt MinCaseVal = SI.case_begin()->getCaseValue()->getValue(); for (auto CI : SI.cases()) { ... } hans: Declaring CI and starting to increment it outside the for-loop is a little unusual. I realize…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions Yes, I like this. I will do that. junbuml: Yes, I like this. I will do that.
return TLI->isTypeLegal(VT) &&		return TLI->isTypeLegal(VT) &&
TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);		TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
}		}

unsigned getFPOpCost(Type *Ty) {		unsigned getFPOpCost(Type *Ty) {
// By default, FP instructions are no more expensive since they are		// By default, FP instructions are no more expensive since they are
// implemented in HW. Target specific TTI can override this.		// implemented in HW. Target specific TTI can override this.
return TargetTransformInfo::TCC_Basic;		return TargetTransformInfo::TCC_Basic;
Show All 14 Lines	case Instruction::ZExt: {
return TargetTransformInfo::TCC_Basic;		return TargetTransformInfo::TCC_Basic;
}		}
}		}

return BaseT::getOperationCost(Opcode, Ty, OpTy);		return BaseT::getOperationCost(Opcode, Ty, OpTy);
}		}

unsigned getInliningThresholdMultiplier() { return 1; }		unsigned getInliningThresholdMultiplier() { return 1; }

		hansUnsubmitted Not Done Reply Inline Actions If we have TLI::isSuitableForBitTests, maybe we should have isSuitableForJumpTable too, that way we don't have to duplicate as much logic, and that one could do what areJTsAllowed() as well. hans: If we have TLI::isSuitableForBitTests, maybe we should have isSuitableForJumpTable too, that…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions Added isSuitableForJumpTable in TLI which used in here and DAG, but keep areJTsAllowed() as a separate function because areJTsAllowed() need to be checked only once in findJumpTable() in DAG, and we can also hit the early exit in this function when JT is not allowed. junbuml: Added isSuitableForJumpTable in TLI which used in here and DAG, but keep areJTsAllowed() as a…
void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {		void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) {
// This unrolling functionality is target independent, but to provide some		// This unrolling functionality is target independent, but to provide some
// motivation for its intended use, for x86:		// motivation for its intended use, for x86:

// According to the Intel 64 and IA-32 Architectures Optimization Reference		// According to the Intel 64 and IA-32 Architectures Optimization Reference
// Manual, Intel Core models and later have a loop stream detector (and		// Manual, Intel Core models and later have a loop stream detector (and
// associated uop queue) that can benefit from partial unrolling.		// associated uop queue) that can benefit from partial unrolling.
// The relevant requirements are:		// The relevant requirements are:
▲ Show 20 Lines • Show All 871 Lines • Show Last 20 Lines

include/llvm/CodeGen/SwitchCaseCluster.h

Show First 20 Lines • Show All 113 Lines • ▼ Show 20 Lines	static bool rangeFitsInWord(const APInt &Low, const APInt &High,
uint64_t BW = DL.getPointerSizeInBits();		uint64_t BW = DL.getPointerSizeInBits();
uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;		uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
return Range <= BW;		return Range <= BW;
}		}

/// Calculate clusters for cases in SI and store them in Clusters.		/// Calculate clusters for cases in SI and store them in Clusters.
const BasicBlock *findClusters(const SwitchInst &SI,		const BasicBlock *findClusters(const SwitchInst &SI,
CaseClusterVector &Clusters);		CaseClusterVector &Clusters);

		/// Return the estimated number of clusters.
		haichengUnsubmitted Not Done Reply Inline Actions Return haicheng: Return
		unsigned getEstimatedNumberOfClusters(const SwitchInst &SI,
		unsigned &JumptableSize);

private:		private:
/// Extract cases from the switch and build initial form of case clusters.		/// Extract cases from the switch and build initial form of case clusters.
void formInitalCaseClusers(const SwitchInst &SI, CaseClusterVector &Clusters);		void formInitalCaseClusers(const SwitchInst &SI, CaseClusterVector &Clusters);

/// Find clusters of cases suitable for jump table lowering.		/// Find clusters of cases suitable for jump table lowering.
void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI);		void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI);

/// Find clusters of cases suitable for bit test lowering.		/// Find clusters of cases suitable for bit test lowering.
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

lib/Analysis/InlineCost.cpp

Show First 20 Lines • Show All 48 Lines • ▼ Show 20 Lines	static cl::opt<int> HintThreshold(
"inlinehint-threshold", cl::Hidden, cl::init(325),		"inlinehint-threshold", cl::Hidden, cl::init(325),
cl::desc("Threshold for inlining functions with inline hint"));		cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>		static cl::opt<int>
ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,		ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
cl::init(45),		cl::init(45),
cl::desc("Threshold for inlining cold callsites"));		cl::desc("Threshold for inlining cold callsites"));

		static cl::opt<bool>
		EnableGenericSwitchCost("inline-generic-switch-cost", cl::Hidden,
		cl::init(false),
		cl::desc("Enable generic switch cost model"));

// We introduce this threshold to help performance of instrumentation based		// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and		// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.		// BFI.
static cl::opt<int> ColdThreshold(		static cl::opt<int> ColdThreshold(
"inlinecold-threshold", cl::Hidden, cl::init(225),		"inlinecold-threshold", cl::Hidden, cl::init(225),
cl::desc("Threshold for inlining functions with cold attribute"));		cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>		static cl::opt<int>
▲ Show 20 Lines • Show All 933 Lines • ▼ Show 20 Lines	bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
// We model unconditional switches as free, see the comments on handling		// We model unconditional switches as free, see the comments on handling
// branches.		// branches.
if (isa<ConstantInt>(SI.getCondition()))		if (isa<ConstantInt>(SI.getCondition()))
return true;		return true;
if (Value *V = SimplifiedValues.lookup(SI.getCondition()))		if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
if (isa<ConstantInt>(V))		if (isa<ConstantInt>(V))
return true;		return true;

// Otherwise, we need to accumulate a cost proportional to the number of		if (!EnableGenericSwitchCost) {
// distinct successor blocks. This fan-out in the CFG cannot be represented		// In this simple switch cost model, we accumulate a cost proportional to
// for free even if we can represent the core switch as a jumptable that		// the number of distinct successor blocks. This fan-out in the CFG cannot
// takes a single instruction.		// be represented for free even if we can represent the core switch as a
//		// jumptable that takes a single instruction.
// NB: We convert large switches which are just used to initialize large phi
// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
// inlining those. It will prevent inlining in cases where the optimization
// does not (yet) fire.
SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;		SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
SuccessorBlocks.insert(SI.getDefaultDest());		SuccessorBlocks.insert(SI.getDefaultDest());
for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)		for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
SuccessorBlocks.insert(I.getCaseSuccessor());		SuccessorBlocks.insert(I.getCaseSuccessor());
// Add cost corresponding to the number of distinct destinations. The first		// Add cost corresponding to the number of distinct destinations. The first
// we model as free because of fallthrough.		// we model as free because of fallthrough.
Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;		Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
return false;		return false;
}		}

		// Otherwise, we assume the most general case where the swith is lowered into
		// either a jump table, bit test, or a balanced binary tree consisting of
		// case clusters without merging adjacent clusters with the same destination.
		// We do not consider the switches that are lowered with a mix of jump table/
		// bit test/BTree. The cost of the switch is proportional to the size of
		// the tree or the size of jump table range.
		haichengUnsubmitted Not Done Reply Inline Actions I think we can exit early if the number of cases is too large. haicheng: I think we can exit early if the number of cases is too large.
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions In SwitchCaseClusterFinder::getEstimatedNumberOfCluster(), we have early exit for a large number of cases. But I guess you mean something else. Can you specify little bit more about the "too large". junbuml: In SwitchCaseClusterFinder::getEstimatedNumberOfCluster(), we have early exit for a large…
		haichengUnsubmitted Not Done Reply Inline Actions One case needs at least one instruction. So if cost + numcases * instrcost > threshold, we can exit early. haicheng: One case needs at least one instruction. So if cost + numcases * instrcost > threshold, we can…
		//
		// NB: We convert large switches which are just used to initialize large phi
		// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
		// inlining those. It will prevent inlining in cases where the optimization
		// does not (yet) fire.

		// Exit early for a large switch, assuming one case needs at least one
		// instruction.
		// FIXME: This is not true for a bit test, but ignore such case for now to
		// save compile-time.
		int CostLowerBound = Cost + SI.getNumCases() * InlineConstants::InstrCost;
		if (CostLowerBound > Threshold) {
		Cost = CostLowerBound;
		hansUnsubmitted Done Reply Inline Actions Could this overflow? hans: Could this overflow?
		return false;
		}

		unsigned JumpTableSize = 0;
		int NumCaseCluster = TTI.getEstimatedNumberOfCaseClusters(SI, &JumpTableSize);
		SmallVector<unsigned, 4> SwitchWorkList;
		SwitchWorkList.push_back(NumCaseCluster);
		Cost -= InlineConstants::InstrCost;

		// If suitable for a jump table, consider the jump table size.
		if (JumpTableSize)
		hansUnsubmitted Not Done Reply Inline Actions If the whole switch is suitable for a jump table, can't we just return after this? hans: If the whole switch is suitable for a jump table, can't we just return after this?
		Cost += JumpTableSize * InlineConstants::InstrCost;

		hansUnsubmitted Not Done Reply Inline Actions Why do we have to do this? The code is basically constructing the tree and throwing it away. It should be possible to compute an estimate for the size of the tree with a closed-form mathematical expression. hans: Why do we have to do this? The code is basically constructing the tree and throwing it away.
		haichengUnsubmitted Not Done Reply Inline Actions If n is the case number, f(n) is the mapping from the case number to the node number of BTree. The recursion is f(n) = 1 + f(n/2) + f (n - n/2), when n > 3. So, f(n) is between n + 2^(log2(n) - 1) - 1 and n + 2^(log2(n)) - 1. The lower bound is about 1.5n - 1 and the upper bound is about 2n - 1 haicheng: If n is the case number, f(n) is the mapping from the case number to the node number of BTree.
		haichengUnsubmitted Not Done Reply Inline Actions The exact equation, with log(n) = floor(log2(n)), is: f(n) = n, for n <= 3; f(n) = n + 2^(log(n) - 1) - 1, for n > 3 && 2^(log(n)) <= n <= 1.5 * 2^(log(n)); f(n) = 2n - 2^(log(n)) - 1, for n > 3 && 1.5 * 2^(log(n)) < n < 2^(log(n) + 1). haicheng: The exact equation is f(n) = n, n <= 3 f(n) = n + 2^(log(n) - 1) - 1, n > 3 && 2^(log(n)) <= n…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions Thanks Haicheng for this. If n is a power of 2, the number of nodes should be n + 2^(log2(n) - 1) - 1. For non-power-of-2 cases, the lower bound is n + 2^(floor(log2(n)) - 1) - 1 and the upper bound is n + 2^(ceiling(log2(n)) - 1) - 1. As an estimate, I think using the upper bound is simple and conservative enough. junbuml: Thanks Haicheng for this. If n is a power of 2, the number of nodes should be n + 2^(log2(n)…
		while (!SwitchWorkList.empty()) {
		unsigned NumCases = SwitchWorkList.back();
		SwitchWorkList.pop_back();
		if (NumCases <= 3)
		hansUnsubmitted Done Reply Inline Actions I'm not sure it's a BTree, but rather a regular binary search tree. hans: I'm not sure it's a BTree, but rather a regular binary search tree.
		// Do not split the tree if the number of remaining cases is less than 3.
		// Just compare switch condition with each case value. Suppose each
		// comparison includes one compare and one conditional branch.
		Cost += (2 * NumCases * InlineConstants::InstrCost);
		else {
		// Split the remaining nodes and add one more comparison.
		unsigned NumLeft = NumCases / 2;
		unsigned NumRight = NumCases - NumLeft;
		SwitchWorkList.push_back(NumLeft);
		SwitchWorkList.push_back(NumRight);
		Cost += (2 * InlineConstants::InstrCost);
		}
		// Exit early if Cost is already larger than Threshold.
		hansUnsubmitted Not Done Reply Inline Actions Very nice! I think the code can be simplified though, and this will allow us to avoid the rounding from Log2_64_Ceil: (N = NumCaseCluster) `NumberOfNonLeaf = 2^(log2(N) - 1) - 1 = 2^log2(N) * 2^-1 - 1 = N/2 - 1` Adding NumCaseCluster to that yields `N + N / 2 - 1 = N * 3 / 2 - 1`. I kind of wish this is all there was to this change and we didn't need all that other code: quick check if the switch is suitable for a bit test or jump table, otherwise compute a cost based on `N * 3 / 2 - 1`. I don't know if it's possible though. hans: Very nice! I think the code can be simplified though, and this will allow us to avoid the…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions Thanks Hans for this. Updated it based on your comments. I think it's difficult to use a closed form which covers all lowering cases, especially across different targets. I believe we need to differentiate each lowering case, as the costs vary depending on the lowering case and target. junbuml: Thanks Hans for this. Updated it based on your comments. I think it's difficult to use a closed…
		if (Cost > Threshold)
		return false;
		}

		haichengUnsubmitted Not Done Reply Inline Actions If the estimation chooses to use jumptable, I think we also need to add the cost of the table which is proportional to the range. haicheng: If the estimation chooses to use jumptable, I think we also need to add the cost of the table…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions I'm not sure if we really need to consider the size of table as a cost. I think just couple of instructions to look up the table and jump to actual blocks need to be considered as cost. junbuml: I'm not sure if we really need to consider the size of table as a cost. I think just couple of…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions You are right Haicheng. Looks like we need to consider the cost of the table as well. junbuml: You are right Haicheng. Looks like we need to consider the cost of the table as well.
		return false;
		hansUnsubmitted Not Done Reply Inline Actions Can we really run into INT_MAX here? Especially given the Threshold check above? hans: Can we really run into INT_MAX here? Especially given the Threshold check above?
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions In a very extreme case, it could happen. For a very large Threshold (e.g., INT_MAX), even though SI.getNumCases() * InlineConstants::InstrCost is smaller than the Threshold, ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost could hit INT_MAX. junbuml: In a very extreme case, it could happen. For a very large Threshold (e.g., INT_MAX), even though…
		}

		hansUnsubmitted Done Reply Inline Actions I would suggest handling the NumCaseCluster <= 3 first and returning early for that, since it's the less complicated case. hans: I would suggest handling the NumCaseCluster <= 3 first and returning early for that, since it's…
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {		bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
// We never want to inline functions that contain an indirectbr. This is		// We never want to inline functions that contain an indirectbr. This is
		hansUnsubmitted Done Reply Inline Actions Aren't you adding `Cost` twice here? You're doing `Cost +=` and also `SwitchCost + Cost`. Oh, Cost is just a regular int; yeah then I see how it can overflow. But std::min is returning a uint64_t here, so it seems you're still not handling the overflow? hans: Aren't you adding `Cost` twice here? You're doing `Cost +=` and also `SwitchCost + Cost`. Oh…
		hansUnsubmitted Not Done Reply Inline Actions Oh never mind, you're capping the uint64_t at INT_MAX so it should work. Probably still don't want to do `Cost +=` though. hans: Oh never mind, you're capping the uint64_t at INT_MAX so it should work. Probably still don't…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions Thanks for this. Yes, it should be Cost =, instead of Cost +=. junbuml: Thanks for this. Yes, it should be Cost =, instead of Cost +=.
		hansUnsubmitted Done Reply Inline Actions Cool, makes sense now. Are the `(uint64_t)` casts strictly necessary though? SwitchCost is already `uint64_t` so I'd imagine `INT_MAX` to get promoted and everything to work out? hans: Cool, makes sense now. Are the `(uint64_t)` casts strictly necessary though? SwitchCost is…
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions We need (uint64_t)INT_MAX, but we don't need (uint64_t) Cost as SwitchCost is uint64_t. junbuml: We need (uint64_t)INT_MAX, but we don't need (uint64_t) Cost as SwitchCost is uint64_t.
// incorrect because all the blockaddress's (in static global initializers		// incorrect because all the blockaddress's (in static global initializers
// for example) would be referring to the original function, and this		// for example) would be referring to the original function, and this
// indirect jump would jump from the inlined copy of the function into the		// indirect jump would jump from the inlined copy of the function into the
// original function which is extremely undefined behavior.		// original function which is extremely undefined behavior.
// FIXME: This logic isn't really right; we can safely inline functions with		// FIXME: This logic isn't really right; we can safely inline functions with
// indirectbr's as long as no other function or global references the		// indirectbr's as long as no other function or global references the
// blockaddress of a block within the current function.		// blockaddress of a block within the current function.
HasIndirectBr = true;		HasIndirectBr = true;
▲ Show 20 Lines • Show All 586 Lines • Show Last 20 Lines

lib/Analysis/TargetTransformInfo.cpp

	Show First 20 Lines • Show All 77 Lines • ▼ Show 20 Lines

	int TargetTransformInfo::getIntrinsicCost(			int TargetTransformInfo::getIntrinsicCost(
	Intrinsic::ID IID, Type RetTy, ArrayRef<const Value > Arguments) const {			Intrinsic::ID IID, Type RetTy, ArrayRef<const Value > Arguments) const {
	int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);			int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
	assert(Cost >= 0 && "TTI should not produce negative costs!");			assert(Cost >= 0 && "TTI should not produce negative costs!");
	return Cost;			return Cost;
	}			}

				/// Thin forwarding wrapper: hands \p SI and \p JTSize to the underlying
				/// implementation object (TTIImpl) and returns its result unchanged.
				/// No validation of the returned value is performed here.
				int TargetTransformInfo::getEstimatedNumberOfCaseClusters(
				const SwitchInst &SI, unsigned *JTSize) const {
				return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize);
				}

	int TargetTransformInfo::getUserCost(const User *U) const {			int TargetTransformInfo::getUserCost(const User *U) const {
	int Cost = TTIImpl->getUserCost(U);			int Cost = TTIImpl->getUserCost(U);
	assert(Cost >= 0 && "TTI should not produce negative costs!");			assert(Cost >= 0 && "TTI should not produce negative costs!");
	return Cost;			return Cost;
	}			}

	bool TargetTransformInfo::hasBranchDivergence() const {			bool TargetTransformInfo::hasBranchDivergence() const {
	return TTIImpl->hasBranchDivergence();			return TTIImpl->hasBranchDivergence();
	▲ Show 20 Lines • Show All 441 Lines • Show Last 20 Lines

lib/CodeGen/SelectionDAG/SwitchCaseCluster.cpp

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	return OptForSize \|\| TLI.getMaximumJumpTableSize() == 0
? UINT_MAX		? UINT_MAX
: TLI.getMaximumJumpTableSize();		: TLI.getMaximumJumpTableSize();
}		}

static inline unsigned getJumptableMinDensity(const bool OptForSize) {		static inline unsigned getJumptableMinDensity(const bool OptForSize) {
return OptForSize ? OptsizeJumpTableDensity : JumpTableDensity;		return OptForSize ? OptsizeJumpTableDensity : JumpTableDensity;
}		}

		/// Return the estimated number of clusters. Note that the number of clusters
		/// identified in this function could be different from the actual number
		/// found for lowering by findClusters(). This function ignores switches that
		/// are lowered with a mix of jump table / bit test / BTree. This function was
		/// initially intended to be used when estimating the cost of a switch in the
		/// inline cost heuristic, but it's a generic cost model to be used in other places
		chandlercUnsubmitted Not Done Reply Inline Actions inlining and loop unrolling. it's a generic cost model. chandlerc: inlining and loop unrolling. it's a generic cost model.
		/// (e.g., in loop unrolling).
		unsigned
		SwitchCaseClusterFinder::getEstimatedNumberOfClusters(const SwitchInst &SI,
		                                                      unsigned &JumpTableSize) {
		  // Result contract, as implemented below:
		  //   * returns 1 when the whole switch looks like a single bit test or a
		  //     single jump table;
		  //   * otherwise returns the number of cases N (one cluster per case).
		  // JumpTableSize is always written: it is the case-value range when the
		  // jump-table path is taken, and 0 in every other path.
		  unsigned N = SI.getNumCases();
		  const bool OptForSize = SI.getParent()->getParent()->optForSize();
		  const unsigned MaxJumpTableSize = getMaxJumpTableSize(OptForSize, TLI);
		  bool IsJTAllowed = areJTsAllowed(TLI, &SI);
		  JumpTableSize = 0;

		  // Early exits: no cases at all, or more cases than either a bit test
		  // (bounded by the pointer width) or a jump table could cover.
		  if (N < 1 ||
		      (DL.getPointerSizeInBits() < MaxJumpTableSize && MaxJumpTableSize < N))
		    return N;

		  if (!IsJTAllowed && DL.getPointerSizeInBits() < N)
		    return N;

		haichengUnsubmitted Not Done Reply Inline Actions We can start from begin()+1 haicheng: We can start from begin()+1
		junbumlAuthorUnsubmitted Not Done Reply Inline Actions Thanks. I will do that. junbuml: Thanks. I will do that.
		  // N >= 1 here, so dereferencing case_begin() is safe. Seed min/max with
		  // the first case and scan the remaining ones (signed comparison).
		  APInt MaxCaseVal = (SI.case_begin()).getCaseValue()->getValue();
		  APInt MinCaseVal = MaxCaseVal;
		  for (auto I = (SI.case_begin() + 1), E = SI.case_end(); I != E; ++I) {
		    const APInt &CaseVal = I.getCaseValue()->getValue();
		    if (CaseVal.sgt(MaxCaseVal))
		      MaxCaseVal = CaseVal;
		    if (CaseVal.slt(MinCaseVal))
		      MinCaseVal = CaseVal;
		  }

		  // Check if suitable for a bit test
		  if (N <= DL.getPointerSizeInBits()) {
		    SmallPtrSet<const BasicBlock *, 4> Dests;
		    for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
		      Dests.insert(I.getCaseSuccessor());

		    if (isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal))
		      return 1;
		  }

		  // Check if suitable for a jump table.
		  if (IsJTAllowed &&
		      !isTooSmallForJumptable(N, TLI.getMinimumJumpTableEntries())) {
		    const unsigned MinDensity = getJumptableMinDensity(OptForSize);
		    // Keep the range in 64 bits. Storing it in an `unsigned` would wrap for
		    // ranges of 2^32 or more, letting an extremely sparse switch masquerade
		    // as a small, dense one and be costed as a single jump table.
		    uint64_t Range =
		        (MaxCaseVal - MinCaseVal).getLimitedValue(UINT64_MAX - 1) + 1;

		    // The size check runs first, so by the time the density helper and the
		    // assignment below are reached, Range is known to fit in `unsigned`.
		    if (Range <= MaxJumpTableSize && ::isDense(Range, N, MinDensity)) {
		      JumpTableSize = static_cast<unsigned>(Range);
		      return 1;
		    }
		  }
		  return N;
		}

bool SwitchCaseClusterFinder::isDense(		bool SwitchCaseClusterFinder::isDense(
const CaseClusterVector &Clusters,		const CaseClusterVector &Clusters,
const SmallVectorImpl<unsigned> &TotalCases, unsigned First, unsigned Last,		const SmallVectorImpl<unsigned> &TotalCases, unsigned First, unsigned Last,
unsigned Density) const {		unsigned Density) const {
assert(Last >= First);		assert(Last >= First);
assert(TotalCases[Last] >= TotalCases[First]);		assert(TotalCases[Last] >= TotalCases[First]);

const APInt &LowCase = Clusters[First].Low->getValue();		const APInt &LowCase = Clusters[First].Low->getValue();
▲ Show 20 Lines • Show All 459 Lines • Show Last 20 Lines

test/Transforms/Inline/AArch64/switch.ll

This file was added.

				; RUN: opt < %s -inline -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true \| FileCheck %s
				; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true \| FileCheck %s

				define i32 @callee_range(i32 %a, i32* %P) {
				switch i32 %a, label %sw.default [
				i32 0, label %sw.bb0
				i32 1000, label %sw.bb1
				i32 2000, label %sw.bb1
				i32 3000, label %sw.bb1
				i32 4000, label %sw.bb1
				i32 5000, label %sw.bb1
				i32 6000, label %sw.bb1
				i32 7000, label %sw.bb1
				i32 8000, label %sw.bb1
				i32 9000, label %sw.bb1
				]

				sw.default:
				store volatile i32 %a, i32* %P
				br label %return
				sw.bb0:
				store volatile i32 %a, i32* %P
				br label %return
				sw.bb1:
				store volatile i32 %a, i32* %P
				br label %return
				return:
				ret i32 42
				}

				define i32 @caller_range(i32 %a, i32* %P) {
				; CHECK-LABEL: @caller_range(
				; CHECK: call i32 @callee_range
				%r = call i32 @callee_range(i32 %a, i32* %P)
				ret i32 %r
				}

				define i32 @callee_bittest(i32 %a, i32* %P) {
				switch i32 %a, label %sw.default [
				i32 0, label %sw.bb0
				i32 1, label %sw.bb1
				i32 2, label %sw.bb2
				i32 3, label %sw.bb0
				i32 4, label %sw.bb1
				i32 5, label %sw.bb2
				i32 6, label %sw.bb0
				i32 7, label %sw.bb1
				i32 8, label %sw.bb2
				]

				sw.default:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb0:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb1:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb2:
				br label %return

				return:
				ret i32 42
				}


				define i32 @caller_bittest(i32 %a, i32* %P) {
				; CHECK-LABEL: @caller_bittest(
				; CHECK-NOT: call i32 @callee_bittest
				%r= call i32 @callee_bittest(i32 %a, i32* %P)
				ret i32 %r
				}

				define i32 @callee_jumptable(i32 %a, i32* %P) {
				switch i32 %a, label %sw.default [
				i32 1001, label %sw.bb101
				i32 1002, label %sw.bb102
				i32 1003, label %sw.bb103
				i32 1004, label %sw.bb104
				i32 1005, label %sw.bb101
				i32 1006, label %sw.bb102
				i32 1007, label %sw.bb103
				i32 1008, label %sw.bb104
				i32 1009, label %sw.bb101
				i32 1010, label %sw.bb102
				i32 1011, label %sw.bb103
				i32 1012, label %sw.bb104
				]

				sw.default:
				br label %return

				sw.bb101:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb102:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb103:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb104:
				store volatile i32 %a, i32* %P
				br label %return

				return:
				ret i32 42
				}

				define i32 @caller_jumptable(i32 %a, i32 %b, i32* %P) {
				; CHECK-LABEL: @caller_jumptable(
				; CHECK: call i32 @callee_jumptable
				%r = call i32 @callee_jumptable(i32 %b, i32* %P)
				ret i32 %r
				}

This is an archive of the discontinued LLVM Phabricator instance.

[InlineCost] Increase the cost of SwitchClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 94880

include/llvm/Analysis/TargetTransformInfo.h

include/llvm/Analysis/TargetTransformInfoImpl.h

include/llvm/CodeGen/BasicTTIImpl.h

include/llvm/CodeGen/SwitchCaseCluster.h

lib/Analysis/InlineCost.cpp

lib/Analysis/TargetTransformInfo.cpp

lib/CodeGen/SelectionDAG/SwitchCaseCluster.cpp

test/Transforms/Inline/AArch64/switch.ll

[InlineCost] Increase the cost of Switch
ClosedPublic