This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/trunk/
-
trunk/
-
include/llvm/
-
llvm/
-
Analysis/
-
TargetTransformInfo.h
-
TargetTransformInfoImpl.h
-
CodeGen/
-
BasicTTIImpl.h
-
Target/
-
TargetLowering.h
-
lib/
-
Analysis/
-
InlineCost.cpp
-
TargetTransformInfo.cpp
-
CodeGen/
-
SelectionDAG/
-
SelectionDAGBuilder.h
-
SelectionDAGBuilder.cpp
-
TargetLoweringBase.cpp
-
test/Transforms/Inline/AArch64/
-
Transforms/
-
Inline/
-
AArch64/
-
switch.ll

Differential D31085

[InlineCost] Increase the cost of Switch
ClosedPublic

Authored by junbuml on Mar 17 2017, 10:10 AM.

Download Raw Diff

Details

Reviewers

hans
bmakam
chandlerc
eraman
haicheng
mcrosier

Commits

rG919f9e8d65ad: [InlineCost] Improve the cost heuristic for Switch
rL301649: [InlineCost] Improve the cost heuristic for Switch

Summary

The motivation example is like below which has 13 cases but only 2 distinct targets

lor.lhs.false2:                                   ; preds = %if.then
  switch i32 %Status, label %if.then27 [
    i32 -7012, label %if.end35
    i32 -10008, label %if.end35
    i32 -10016, label %if.end35
    i32 15000, label %if.end35
    i32 14013, label %if.end35
    i32 10114, label %if.end35
    i32 10107, label %if.end35
    i32 10105, label %if.end35
    i32 10013, label %if.end35
    i32 10011, label %if.end35
    i32 7008, label %if.end35
    i32 7007, label %if.end35
    i32 5002, label %if.end35
  ]

which is compiled into a balanced binary tree like this on AArch64 (similar on X86)

.LBB853_9:                              // %lor.lhs.false2
        mov     w8, #10012
        cmp             w19, w8
        b.gt    .LBB853_14
// BB#10:                               // %lor.lhs.false2
        mov     w8, #5001
        cmp             w19, w8
        b.gt    .LBB853_18
// BB#11:                               // %lor.lhs.false2
        mov     w8, #-10016
        cmp             w19, w8
        b.eq    .LBB853_23
// BB#12:                               // %lor.lhs.false2
        mov     w8, #-10008
        cmp             w19, w8
        b.eq    .LBB853_23
// BB#13:                               // %lor.lhs.false2
        mov     w8, #-7012
        cmp             w19, w8
        b.eq    .LBB853_23
        b       .LBB853_3
.LBB853_14:                             // %lor.lhs.false2
        mov     w8, #14012
        cmp             w19, w8
        b.gt    .LBB853_21
// BB#15:                               // %lor.lhs.false2
        mov     w8, #-10105
        add             w8, w19, w8
        cmp             w8, #9          // =9
        b.hi    .LBB853_17
// BB#16:                               // %lor.lhs.false2
        orr     w9, wzr, #0x1
        lsl     w8, w9, w8
        mov     w9, #517
        and             w8, w8, w9
        cbnz    w8, .LBB853_23
.LBB853_17:                             // %lor.lhs.false2
        mov     w8, #10013
        cmp             w19, w8
        b.eq    .LBB853_23
        b       .LBB853_3
.LBB853_18:                             // %lor.lhs.false2
        mov     w8, #-7007
        add             w8, w19, w8
        cmp             w8, #2          // =2
        b.lo    .LBB853_23
// BB#19:                               // %lor.lhs.false2
        mov     w8, #5002
        cmp             w19, w8
        b.eq    .LBB853_23
// BB#20:                               // %lor.lhs.false2
        mov     w8, #10011
        cmp             w19, w8
        b.eq    .LBB853_23
        b       .LBB853_3
.LBB853_21:                             // %lor.lhs.false2
        mov     w8, #14013
        cmp             w19, w8
        b.eq    .LBB853_23
// BB#22:                               // %lor.lhs.false2
        mov     w8, #15000
        cmp             w19, w8
        b.ne    .LBB853_3

However, the inline cost model estimates the cost to be linear with the number of distinct targets and the cost of the above switch is just 2 InstrCosts. The function containing this switch is then inlined about 900 times.

This change modifies the model to be linear with the size of the balanced binary tree.

Diff Detail

Repository: rL LLVM

Event Timeline

junbuml created this revision.Mar 17 2017, 10:10 AM

Herald added subscribers: rengolin, aemerson. · View Herald TranscriptMar 17 2017, 10:10 AM

junbuml mentioned this in D29870: [InlineCost] Increase the cost of Switch.Mar 17 2017, 10:15 AM

Note that this change was originally written by @haicheng in D29870 and I updated his change by using a new TTI hook to get the number of case cluster based on D31080.

I can still see the +8% in performance and -7.63% in size in spec2000/vortex. No significant performance and size change in other benchmarks in spec2000/spec2006.

Can you please also try this on top of the changes from D30333? That's changing one switch transformation to happen much later, making it more sensitive to inline cost estimates.

Can you please also try this on top of the changes from D30333? That's changing one switch transformation to happen much later, making it more sensitive to inline cost estimates.

Thanks I will try this !

junbuml added a reviewer: hans.Mar 20 2017, 11:11 AM

While I'm working on the accurate version of the case cluster calculation through the TTI hook in this change based on D31080, I also want to see if it makes sense to apply different levels of approximation depending on the number of cases. For example, if the number of cases exceeds some threshold, we can use a very rough approximation (e.g., use the number of cases as the number of clusters, like D29870). Otherwise, we can use a somewhat more accurate version of the cluster calculation based on D31080.

junbuml mentioned this in D31080: [DAG] Extract switch lowering as a spearate object NFC.Mar 29 2017, 10:51 AM

The below comment by Hans is copied from D31080 :

Before we get any further, I also would like to ask if you have done any measurements of compile-time with this set of patches. As I said before, I think this could be quite an expensive hook to call for the inline cost analysis, and it would be nice to see some numbers. If it turns out that it is expensive, perhaps we could come up with some better inline cost heuristic, perhaps something based on the density of the switch.

I think there are three different cost heuristics:

The cheapest yet inaccurate version is D29870 in which the number of case is simply used as number of cluster.
The most accurate yet expensive version is to use a hook from D31080.
I guess what Hans suggested in above comment (something based on the density of the switch) must be somewhere in between #1 and #2.

Both #1 and #2 consider forming a BTree, and don't require to update the cost heuristic for the changes in switch lowering. I'm not clear about #3 in terms of the accuracy, compile-time, and maintenance. Hans, can you give me little bit more detail?

For me, at least #1 is still better than the current heuristic, which simply counts the number of distinct successor blocks. If so, would it make sense to use #1 by default and add different levels of cost heuristics behind flags, so that we can come up with the most reasonable heuristic and allow others to experiment?

In D31085#713391, @junbuml wrote:

The below comment by Hans is copied from D31080 :

Before we get any further, I also would like to ask if you have done any measurements of compile-time with this set of patches. As I said before, I think this could be quite an expensive hook to call for the inline cost analysis, and it would be nice to see some numbers. If it turns out that it is expensive, perhaps we could come up with some better inline cost heuristic, perhaps something based on the density of the switch.

I think there are three different cost heuristics:

The cheapest yet inaccurate version is D29870 in which the number of case is simply used as number of cluster.

The most accurate yet expensive version is to use a hook from D31080.

I guess what Hans suggested in above comment (something based on the density of the switch) must be somewhere in between #1 and #2.

Both #1 and #2 consider forming a BTree, and don't require to update the cost heuristic for the changes in switch lowering. I'm not clear about #3 in terms of the accuracy, compile-time, and maintenance. Hans, can you give me little bit more detail?

For me, at least #1 is still better than the current heuristic, which simply counts the number of distinct successor blocks. If so, would it make sense to use #1 by default and add different levels of cost heuristics behind flags, so that we can come up with the most reasonable heuristic and allow others to experiment?

Yes, counting the number of successor blocks doesn't seem right.

I think a decent heuristic might be:

Ask TTI about the native word width, and check if this switch is trivially lowered with a bit test (two successors, case range fits in a machine word); this is very cheap
Ask TTI about jump table density conditions; if the whole switch is dense enough, assume it's a jump table and compute a cost based on that
Otherwise assume it's a balanced tree and estimate the cost based on number of cases (it would be nice not to actually have to build the tree, but just compute an estimate cost using some formula)

This means the estimate will be off for switches that are lowered with a mix of jump tables, binary trees and bit-tests, but those aren't that common, and not being completely accurate is probably fine.

I think going with only #1 might not be so good because we'll overestimate the cost of many switches. But I think computing 1-3 would make for a good and cheap heuristic.

If we are not worried about the mix of cases, I think this is reasonably cheap to use.

Chandler, do you agree with the heuristic Hans suggested above? Even though it do not cover switches that are lowered with a mix of jump table/bit test/BTree, I think this is reasonable compromise between accuracy and cost of the hook.

Based on Hans' suggestion, checked if switch is suitable for either bit test or jump table. If not suitable for both, use BTree. Please take a look and let me know any comment.

haicheng added inline comments.Apr 10 2017, 9:13 AM

include/llvm/CodeGen/SwitchCaseCluster.h
123 ↗	(On Diff #94559)	Return
lib/Analysis/InlineCost.cpp
1006 ↗	(On Diff #94559)	I think we can exit early if the number of cases is too large.
1015 ↗	(On Diff #94559)	If the estimation chooses to use jumptable, I think we also need to add the cost of the table which is proportional to the range.
lib/CodeGen/SelectionDAG/SwitchCaseCluster.cpp
86 ↗	(On Diff #94559)	We can start from begin()+1
test/Transforms/Inline/switch.ll
3 ↗	(On Diff #94559)	We may need to add tests for jump table and bit test.

junbuml added inline comments.Apr 10 2017, 12:37 PM

lib/Analysis/InlineCost.cpp
1006 ↗	(On Diff #94559)	In SwitchCaseClusterFinder::getEstimatedNumberOfCluster(), we have early exit for a large number of cases. But I guess you mean something else. Can you specify little bit more about the "too large".
1015 ↗	(On Diff #94559)	I'm not sure if we really need to consider the size of table as a cost. I think just couple of instructions to look up the table and jump to actual blocks need to be considered as cost.
lib/CodeGen/SelectionDAG/SwitchCaseCluster.cpp
86 ↗	(On Diff #94559)	Thanks. I will do that.
test/Transforms/Inline/switch.ll
3 ↗	(On Diff #94559)	Yes. I will do that.

junbuml added inline comments.Apr 10 2017, 1:48 PM

lib/Analysis/InlineCost.cpp
1015 ↗	(On Diff #94559)	You are right Haicheng. Looks like we need to consider the cost of the table as well.

haicheng added inline comments.Apr 10 2017, 2:18 PM

lib/Analysis/InlineCost.cpp
1006 ↗	(On Diff #94559)	One case needs at least one instruction. So if cost + numcases * instrcost > threshold, we can exit early.

In D31085#715294, @junbuml wrote:

Chandler, do you agree with the heuristic Hans suggested above? Even though it do not cover switches that are lowered with a mix of jump table/bit test/BTree, I think this is reasonable compromise between accuracy and cost of the hook.

Yes, I like this model.

lib/CodeGen/SelectionDAG/SwitchCaseCluster.cpp
68–69 ↗	(On Diff #94559)	inlining and loop unrolling. it's a generic cost model.

This really looks like it is going in the right direction. I'm going to work on reviewing some of the code changes a bit more closely, but I wanted to mention one other thing.

This seems like a really good change to the inlining cost model, but it also seems likely to be a pretty big change. I think it is important to collect some benchmark data to make sure we're not going to uncover a significant regression by surprise. At the very least, I think running the LLVM test suite would be a good start and identifying:

How many benchmarks change
For the ones that change, what is the codesize impact
For the ones that change, what is the runtime impact

For #2 and #3 you probably want at least '-O2', but maybe also '-O3' and '-Os'.

It may be useful to ask others to benchmark other applications and/or various architectures as well. To facilitate that, I might suggest putting the code for this in under a flag that is off and then soliciting benchmark data on llvm-dev with the flag, and based on that data, enable the flag. But if this doesn't fire too often in the test suite, or the results are particularly good, might be easy to just try it and see.

Thanks again for working on this!

Addressed Haicheng's comments and added a flag as Chandler asked. With this update, I kicked off performance tests for the LLVM test suite, spec2000, and spec2006 on AArch64, but I will only be able to share organized data early next week, as I will be out of the office for the rest of the week. Please let me know if you have any comments.

I was hoping we wouldn't need the refactoring to SwitchCaseCluster.cpp, and that InlineCost.cpp could just work with TLI to check whether jump tables are allowed, density requirements etc.

My concern is that separating the case cluster code from SelectionDAGBuilder might be more trouble than it's worth.

I was hoping we wouldn't need the refactoring to SwitchCaseCluster.cpp, and that InlineCost.cpp could just work with TLI to check whether jump tables are allowed, density requirements etc.

Sure, I can make InlineCost work with just TLI, but I don't want to duplicate the same code for InlineCost from lowering. So I will refactor just a little bit on SelectionDAGBuilder to expose some util functions.

My concern is that separating the case cluster code from SelectionDAGBuilder might be more trouble than it's worth.

Can you give me little bit more details about your concern ?

In D31085#728192, @junbuml wrote:

I was hoping we wouldn't need the refactoring to SwitchCaseCluster.cpp, and that InlineCost.cpp could just work with TLI to check whether jump tables are allowed, density requirements etc.

Sure, I can make InlineCost work with just TLI, but I don't want to duplicate the same code for InlineCost from lowering. So I will refactor just a little bit on SelectionDAGBuilder to expose some util functions.

That sounds good to me.

My concern is that separating the case cluster code from SelectionDAGBuilder might be more trouble than it's worth.

Can you give me little bit more details about your concern ?

The concern is just that the switch lowering code is fairly tightly integrated with SelectionDAGBuilder, and pulling it out isn't worth the effort if all we need from TLI is some basic info, like if the whole switch is dense enough for a lookup table.

Addressed Hans' comments; now the switch cost heuristic just work with TLI.

Please see the list of benchmarks that changed in the LLVM test suite and spec2000/2006 below. No significant code size regression was found in any config. Overall there was a minor positive impact on code size in the LLVM test suite, but at O3 with LTO there was a 7.9% reduction in code size in spec2000/vortex.

On AArch64, I didn't see any clear performance impact in the LLVM test suite, but at O3 with LTO I observed a +17.82% performance improvement in spec2000/vortex.

O2 :

Benchmarks	Code size (- is better)
MultiSource/Applications/siod	-0.061%
MultiSource/Applications/hbd	0.000%
MultiSource/Applications/JM/lencod/lencod	-0.070%
MultiSource/Applications/JM/ldecod/ldecod	0.000%
MultiSource/Applications/lua/lua	0.001%
MultiSource/Applications/d/make_dparser	-0.081%
MultiSource/Applications/sqlite3/sqlite3	-0.217%
MultiSource/Benchmarks/Prolangs-C/bison/mybison	0.000%
MultiSource/Benchmarks/MiBench/consumer-typeset/consumer-typeset	0.000%
MultiSource/Benchmarks/MiBench/consumer-jpeg/consumer-jpeg	-0.082%
MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/cjpeg	0.000%
MultiSource/Benchmarks/MallocBench/gs/gs	0.000%
spec2000/perlbmk	-0.066%
spec2000/gcc	-0.183%
spec2000/parser	0.000%
spec2000/crafty	-0.080%
spec2000/mesa	-0.286%
spec2006/soplex	0.000%
spec2006/xalancbmk	0.000%
spec2006/hmmer	-0.072%
spec2006/gcc	-0.191%
spec2006/h264ref	0.000%
spec2006/povray	-0.057%
spec2006/perlbench	-0.060%

Os :

Benchmarks	Code size(- is better)
MultiSource/Applications/siod/siod	-0.123%
MultiSource/Applications/JM/lencod/lencod	0.000%
MultiSource/Applications/lua/lua	0.000%
MultiSource/Applications/sqlite3/sqlite3	0.001%
MultiSource/Benchmarks/mafft/pairlocalalign	0.000%
MultiSource/Benchmarks/7zip/7zip-benchmark	0.001%
spec2000/perlbmk	0.000%
spec2000/gap	0.000%
spec2000/gcc	0.000%
spec2000/mesa	-0.071%
spec2000/vortex	0.000%
spec2006/gobmk	0.000%
spec2006/xalancbmk	0.000%
spec2006/hmmer	0.000%
spec2006/gcc	-0.048%
spec2006/omnetpp	0.000%
spec2006/h264ref	0.000%
spec2006/perlbench	-0.062%

O3 :

Benchmarks	Code size(- is better)
MultiSource/Applications/kimwitu++/kc	0.000%
MultiSource/Applications/siod/siod	-0.061%
MultiSource/Applications/hbd/hbd	0.000%
MultiSource/Applications/JM/lencod/lencod	0.000%
MultiSource/Applications/lua/lua	0.001%
MultiSource/Applications/SIBsim4/SIBsim4	0.001%
MultiSource/Applications/d/make_dparser	0.000%
MultiSource/Applications/sqlite3/sqlite3	-0.212%
MultiSource/Benchmarks/mediabench/jpeg/jpeg-6a/cjpeg	0.000%
MultiSource/Benchmarks/MallocBench/gs/gs	0.000%
MultiSource/Benchmarks/7zip/7zip-benchmark	0.000%
spec2000/perlbmk	-0.066%
spec2000/gcc	-0.182%
spec2000/mesa	-0.356%
spec2000/vortex	0.000%
spec2006/soplex	0.000%
spec2006/xalancbmk	-0.019%
spec2006/bzip2	0.000%
spec2006/hmmer	0.000%
spec2006/gcc	-0.188%
spec2006/h264ref	0.000%
spec2006/povray	-0.056%
spec2006/perlbench	-0.121%

Spec2000/2006 performance in O3 :

Benchmarks	Score(+ is better)
spec2000/perlbmk	+2.706%
spec2000/vortex	+2.028%
spec2000/mesa	-1.76%
spec2006/soplex	-1.334%
spec2006/povray	+1.509%
spec2006/perlbench	+0.757%

Spec2000/2006 performance in O3 with LTO :

Benchmarks	Score(+ is better)
spec2000/gzip	+2.180%
spec2000/mesa	-4.093%
spec2000/vortex	+17.822%

It's starting to look much simpler, which is great.

include/llvm/Analysis/TargetTransformInfo.h
773 ↗	(On Diff #95777)	The other function here return an estimated cost for lowering. I think that would be a better interface for this too.
include/llvm/CodeGen/BasicTTIImpl.h
175 ↗	(On Diff #95777)	I wish this could be much simpler. Maybe most of the code could defer to TLI::isSuitableForBitTest / isSuitableForJumpTable which could also be used from the DAG code.
225 ↗	(On Diff #95777)	If we have TLI::isSuitableForBitTests, maybe we should have isSuitableForJumpTable too, that way we don't have to duplicate as much logic, and that one could do what areJTsAllowed() as well.
lib/Analysis/InlineCost.cpp
1049 ↗	(On Diff #95777)	If the whole switch is suitable for a jump table, can't we just return after this?
1051 ↗	(On Diff #95777)	Why do we have to do this? The code is basically constructing the tree and throwing it away. It should be possible to compute an estimate for the size of the tree with a closed-form mathematical expression.

haicheng added inline comments.Apr 20 2017, 6:37 AM

lib/Analysis/InlineCost.cpp
1051 ↗	(On Diff #95777)	If n is the case number, f(n) is the mapping from the case number to the node number of BTree. The recursion is f(n) = 1 + f(n/2) + f (n - n/2), when n > 3. So, f(n) is between n + 2^(log2(n) - 1) - 1 and n + 2^(log2(n)) - 1. The lower bound is about 1.5n - 1 and the upper bound is about 2n - 1

haicheng added inline comments.Apr 20 2017, 8:48 AM

lib/Analysis/InlineCost.cpp
1051 ↗	(On Diff #95777)	The exact equation, with $k = \lfloor \log_2 n \rfloor$, is: $f(n) = n$ for $n \le 3$; $f(n) = n + 2^{k-1} - 1$ for $n > 3$ and $2^{k} \le n \le 1.5 \cdot 2^{k}$; $f(n) = 2n - 2^{k} - 1$ for $n > 3$ and $1.5 \cdot 2^{k} < n < 2^{k+1}$.

junbuml updated this revision to Diff 96252.Apr 21 2017, 3:31 PM

junbuml added inline comments.

include/llvm/Analysis/TargetTransformInfo.h
773 ↗	(On Diff #95777)	I doubt if this is good place to get the inline cost because other functions handle user costs which is different from inline cost. Mixing the user cost and inline cost here might be a bad choice. I believe the inline cost should be decided in InlineCost.
include/llvm/CodeGen/BasicTTIImpl.h
175 ↗	(On Diff #95777)	Tried to make it simpler, but we still need to find Min/MaxValue here without forming CaseClusters and it's also good to check IsJTAllowed early before doing the actual suitability check to avoid iterating the for loop to find Min/MaxValue in case not allowed. Please take a look and let me know if there is any part you want to move in either isSuitableForBitTest or isSuitableForJumpTable.
225 ↗	(On Diff #95777)	Added isSuitableForJumpTable in TLI which used in here and DAG, but keep areJTsAllowed() as a separate function because areJTsAllowed() need to be checked only once in findJumpTable() in DAG, and we can also hit the early exit in this function when JT is not allowed.
lib/Analysis/InlineCost.cpp
1051 ↗	(On Diff #95777)	Thanks Haicheng for this. If n is a power of 2, the number of node should be n + 2^(log2(n) - 1) - 1. For non-power of 2 cases , the lower bound is n + 2^(floor(log2(n)) - 1) - 1 and the upper bound is n + 2^(ceiling(log2(n)) - 1) - 1. As a estimation, I think the use of upper bound is simple and conservative enough.

hans added inline comments.Apr 21 2017, 4:35 PM

include/llvm/Analysis/TargetTransformInfo.h
773 ↗	(On Diff #95777)	I was thinking it doesn't need to be used as the inlining cost directly, but exposing some metric the inline cost model could then make use of - that's kind of what we're doing anyway. What I don't like about this one is that it feels a little out of place compared to other functions which return an estimate of the cost of lowering an instruction, whereas this seems to return details about the lowering instead. I'm not sure how to make this better though.
include/llvm/Target/TargetLowering.h
766 ↗	(On Diff #96252)	NumCluster -> NumClusters please
lib/Analysis/InlineCost.cpp
1035 ↗	(On Diff #96252)	I'm not sure it's a BTree, but rather a regular binary search tree.
1048 ↗	(On Diff #96252)	Very nice! I think the code can be simplified though, and this will allow us to avoid the rounding from Log2_64_Ceil: (N = NumCaseCluster) `NumberOfNonLeaf = 2^(log2(N) - 1) - 1 = 2^log2(N) * 2^-1 - 1 = N/2 - 1` Adding NumCaseCluster to that yields `N + N / 2 - 1 = N * 3 / 2 - 1`. I kind of wish this is all there was to this change and we didn't need all that other code: quick check if the switch is suitable for a bit test or jump table, otherwise compute a cost based on `N * 3 / 2 - 1`. I don't know if it's possible though.
1053 ↗	(On Diff #96252)	Can we really run into INT_MAX here? Especially given the Threshold check above?
1055 ↗	(On Diff #96252)	I would suggest handling the NumCaseCluster <= 3 first and returning early for that, since it's the less complicated case.

Addressed Hans' comments.

include/llvm/Analysis/TargetTransformInfo.h
773 ↗	(On Diff #95777)	I see what you meant and I can see functions which return the lowering cost. However, this file also hold functions which expose details about the machine directly to be used in IR-level. For me it seems that exposing this information from TLI to InlineCost still keep the original intention of this class. I will be happy to hear any better suggestion about the way of exposing this lowering information.
lib/Analysis/InlineCost.cpp
1048 ↗	(On Diff #96252)	Thanks Hans for this. Updated it based your comments. I think it's difficult to use a closed form which cover all lowering cases, especially in different targets. I believe we need to differentiate each lowering case as the costs varying depending on the lowering cases and targets.
1053 ↗	(On Diff #96252)	In very extreme case, it could happen. For a very large Threshold (e.g., INT_MAX), even though SI.getNumCases() * InlineConstants::InstrCost is smaller than the Threshold. ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost could hit INT_MAX.

Apologies for the reviews dragging out.

I think we've got the right idea with computing inline cost based on an estimate of the number of nodes in binary search tree, but the patch is still making a lot of changes all over the place making it hard to review.

The smaller you can make this change, the easier it will be to get this committed.

include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	This check should probably be done in `isSuitableForJumpTable`. There should be no need to pass NumClusters to `areJTsAllowed` since whether jump tables are allowed only depends on the target and function. The switch instruction also shouldn't be passed in, just the function.
779 ↗	(On Diff #96464)	This is used in isSuitableForBitTests, but does it really need to be exposed in the TargetLowering interface?
792 ↗	(On Diff #96464)	What's the difference between Range and JumpTableSize? As far as I can tell, those are always the same.
lib/Analysis/InlineCost.cpp
1057 ↗	(On Diff #96464)	Aren't you adding `Cost` twice here? You're doing `Cost +=` and also `SwitchCost + Cost`. Oh, Cost is just a regular int; yeah then I see how it can overflow. But std::min is returning a uint64_t here, so it seems you're still not handling the overflow?

hans added inline comments.Apr 25 2017, 11:00 AM

lib/Analysis/InlineCost.cpp
1057 ↗	(On Diff #96464)	Oh never mind, you're capping the uint64_t at INT_MAX so it should work. Probably still don't want to do `Cost +=` though.

I think we've got the right idea with computing inline cost based on an estimate of the number of nodes in binary search tree, but the patch is still making a lot of changes all over the place making it hard to review.
The smaller you can make this change, the easier it will be to get this committed.

For me, it seems that the code change in DAG and TLI only make sense when reviewed together with the changes in InlineCost. That's why I put them together. If you generally agree with the change in DAG side code, I can break it as a separate patch, and leave only inliner side change in here.

include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	Yes, I agree that areJTsAllowed() should see only function and target, so I just pass a Function to areJTsAllowed(). But I don't think isSuitableForJumpTable is good place for this check because this check is for the whole clusters of a switch, but isSuitableForJumpTable should see a set of clusters, not necessary the whole clusters, especially when we try to build jump tables for split clusters in DAG. We also need to do an early exit when this check hit. So I do this check in findJumpTables() in DAG and getEstimatedNumberOfCaseClusters() in TTI.
779 ↗	(On Diff #96464)	This is also used in findBitTestClusters and buildBitTests(), and for me doing this check through TLI doesn't seem to be weird. Please let me know if you see any better place for this function.
792 ↗	(On Diff #96464)	Yes, these are the same.
lib/Analysis/InlineCost.cpp
1057 ↗	(On Diff #96464)	Thanks for this. Yes, it should be Cost =, instead of Cost +=.

In D31085#737231, @junbuml wrote:

I think we've got the right idea with computing inline cost based on an estimate of the number of nodes in binary search tree, but the patch is still making a lot of changes all over the place making it hard to review.
The smaller you can make this change, the easier it will be to get this committed.

For me, it seems that the code change in DAG and TLI only make sense when reviewed together with the changes in InlineCost. That's why I put them together. If you generally agree with the change in DAG side code, I can break it as a separate patch, and leave only inliner side change in here.

Yes, reviewing them together makes total sense. I just wish the change we're making were smaller.

Anyway, I think this is really close now. Each time I read through it, it looks better :-)

include/llvm/CodeGen/BasicTTIImpl.h
194 ↗	(On Diff #96614)	Declaring CI and starting to increment it outside the for-loop is a little unusual. I realize this is to avoid repeating the first iteration (maybe I wrote this somewhere?), but I think it would be better if this were written as a straight-forward loop: APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); APInt MinCaseVal = SI.case_begin()->getCaseValue()->getValue(); for (auto CI : SI.cases()) { ... }
include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	isSuitableForJumpTable() seems like a good place for this to me. It should be able to handle the whole switch range or a subset in the same way.
lib/Analysis/InlineCost.cpp
1018 ↗	(On Diff #96614)	Could this overflow?
1057 ↗	(On Diff #96464)	Cool, makes sense now. Are the `(uint64_t)` casts strictly necessary though? SwitchCost is already `uint64_t` so I'd imagine `INT_MAX` to get promoted and everything to work out?

junbuml added inline comments.Apr 26 2017, 10:34 AM

include/llvm/CodeGen/BasicTTIImpl.h
194 ↗	(On Diff #96614)	Yes, I like this. I will do that.
include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	If we use range for this check in isSuitableForJumpTable(), it will change the behavior in lowering. Current lowering code use the total number of cluster (N) for this check only once at the beginning of findJumpTables(), which is different from both Range and NumCases. And also when partitioning clusters for jump tables, doing this check in isSuitableForJumpTable() may change the behavior of the loop finding partitions. If either Range or NumCases should be used for this check, I think it should be a separate patch.

hans added inline comments.Apr 26 2017, 11:15 AM

include/llvm/Target/TargetLowering.h
771 ↗	(On Diff #96464)	Oh, I see what you mean. It's annoying that we'll end up with a `isSuitableForJumpTable()` that doesn't take the minimum size into account though :-/

junbuml updated this revision to Diff 96799.Apr 26 2017, 11:55 AM

junbuml marked 3 inline comments as done.

junbuml added inline comments.

lib/Analysis/InlineCost.cpp
1057 ↗	(On Diff #96464)	We need (uint64_t)INT_MAX, but we don't need (uint64_t) Cost as SwitchCost is uint64_t.

hans added inline comments.Apr 26 2017, 2:04 PM

include/llvm/Target/TargetLowering.h
787 ↗	(On Diff #96799)	It still seems backward that this is checking density and maximum table size, but that the minimum size is expected to be checked elsewhere :-/ There must be some better way to factor this.

junbuml added inline comments.Apr 26 2017, 2:30 PM

include/llvm/Target/TargetLowering.h
787 ↗	(On Diff #96799)	I agree that doing minSize check in isSuitableForJumpTable looks clear. However, I want to keep the changes in lowering side in this patch is NFC. That's why we cannot do this check in isSuitableForJumpTable(). If you want to add MinSize check in isSuitableForJumpTable(), I think we should also pass the cluster size (N) to isSuitableForJumpTable() in findJumpTable() because Range and NumCases could be different from the cluster size(N). Only way I can think of doing this check in isSuitableForJumpTable() is : pass the cluster size (N) to isSuitableForJumpTable() for the first call of isSuitableForJumpTable() in findJumpTable() because this is for the whole range. Since the second calls of isSuitableForJumpTable() in findJumpTable for partitionized clusters should not check MinSize with sub clusters, we do not use isSuitableForJumpTable(), instead checking the max size and density separately just like current code. However, I think this way is even worse than my current code. Please let me know your thought.

lgtm

include/llvm/Target/TargetLowering.h
787 ↗	(On Diff #96799)	Let's put a FIXME in isSuitableForJumpTable that it would be nice if the min size check could somehow be combined with the other checks here.

This revision is now accepted and ready to land.Apr 26 2017, 2:48 PM

Added FIXME as Hans suggested.
I will commit it if there are no further comments from other reviewers by tomorrow. I will also post a follow-up patch to enable the flag (-inline-generic-switch-cost).

Thank you very much for the review.

Closed by commit rL301649: [InlineCost] Improve the cost heuristic for Switch (authored by junbuml). · Explain WhyApr 28 2017, 9:17 AM

This revision was automatically updated to reflect the committed changes.

tejohnson mentioned this in D67716: [Inliner] Remove incorrect early exit during switch cost computation.Sep 18 2019, 9:25 AM

tejohnson mentioned this in rL372440: [Inliner] Remove incorrect early exit during switch cost computation.Sep 20 2019, 4:28 PM

tejohnson mentioned this in rG2f32e5d84d34: [Inliner] Remove incorrect early exit during switch cost computation.

Revision Contents

Path

Size

llvm/

trunk/

include/

llvm/

Analysis/

TargetTransformInfo.h

12 lines

TargetTransformInfoImpl.h

6 lines

CodeGen/

BasicTTIImpl.h

56 lines

Target/

TargetLowering.h

71 lines

lib/

Analysis/

InlineCost.cpp

76 lines

TargetTransformInfo.cpp

6 lines

CodeGen/

SelectionDAG/

SelectionDAGBuilder.h

19 lines

SelectionDAGBuilder.cpp

135 lines

TargetLoweringBase.cpp

16 lines

test/

Transforms/

Inline/

AArch64/

switch.ll

123 lines

Diff 97111

llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 191 Lines • ▼ Show 20 Lines	int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys) const;		ArrayRef<Type *> ParamTys) const;

/// \brief Estimate the cost of an intrinsic when lowered.		/// \brief Estimate the cost of an intrinsic when lowered.
///		///
/// Mirrors the \c getCallCost method but uses an intrinsic identifier.		/// Mirrors the \c getCallCost method but uses an intrinsic identifier.
int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,		int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments) const;		ArrayRef<const Value *> Arguments) const;

		/// \return The estimated number of case clusters when lowering \p 'SI'.
		/// \p JTSize Set a jump table size only when \p SI is suitable for a jump
		/// table.
		unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		unsigned &JTSize) const;

/// \brief Estimate the cost of a given IR user when lowered.		/// \brief Estimate the cost of a given IR user when lowered.
///		///
/// This can estimate the cost of either a ConstantExpr or Instruction when		/// This can estimate the cost of either a ConstantExpr or Instruction when
/// lowered. It has two primary advantages over the \c getOperationCost and		/// lowered. It has two primary advantages over the \c getOperationCost and
/// \c getGEPCost above, and one significant disadvantage: it can only be		/// \c getGEPCost above, and one significant disadvantage: it can only be
/// used when the IR construct has already been formed.		/// used when the IR construct has already been formed.
///		///
/// The advantages are that it can inspect the SSA use graph to reason more		/// The advantages are that it can inspect the SSA use graph to reason more
▲ Show 20 Lines • Show All 551 Lines • ▼ Show 20 Lines	public:
virtual int getCallCost(const Function *F, int NumArgs) = 0;		virtual int getCallCost(const Function *F, int NumArgs) = 0;
virtual int getCallCost(const Function *F,		virtual int getCallCost(const Function *F,
ArrayRef<const Value *> Arguments) = 0;		ArrayRef<const Value *> Arguments) = 0;
virtual unsigned getInliningThresholdMultiplier() = 0;		virtual unsigned getInliningThresholdMultiplier() = 0;
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,		virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<Type *> ParamTys) = 0;		ArrayRef<Type *> ParamTys) = 0;
virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,		virtual int getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
ArrayRef<const Value *> Arguments) = 0;		ArrayRef<const Value *> Arguments) = 0;
		virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		unsigned &JTSize) = 0;
virtual int getUserCost(const User *U) = 0;		virtual int getUserCost(const User *U) = 0;
virtual bool hasBranchDivergence() = 0;		virtual bool hasBranchDivergence() = 0;
virtual bool isSourceOfDivergence(const Value *V) = 0;		virtual bool isSourceOfDivergence(const Value *V) = 0;
virtual unsigned getFlatAddressSpace() = 0;		virtual unsigned getFlatAddressSpace() = 0;
virtual bool isLoweredToCall(const Function *F) = 0;		virtual bool isLoweredToCall(const Function *F) = 0;
virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;		virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;
virtual bool isLegalAddImmediate(int64_t Imm) = 0;		virtual bool isLegalAddImmediate(int64_t Imm) = 0;
virtual bool isLegalICmpImmediate(int64_t Imm) = 0;		virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
▲ Show 20 Lines • Show All 287 Lines • ▼ Show 20 Lines	unsigned getMinPrefetchStride() override {
return Impl.getMinPrefetchStride();		return Impl.getMinPrefetchStride();
}		}
unsigned getMaxPrefetchIterationsAhead() override {		unsigned getMaxPrefetchIterationsAhead() override {
return Impl.getMaxPrefetchIterationsAhead();		return Impl.getMaxPrefetchIterationsAhead();
}		}
unsigned getMaxInterleaveFactor(unsigned VF) override {		unsigned getMaxInterleaveFactor(unsigned VF) override {
return Impl.getMaxInterleaveFactor(VF);		return Impl.getMaxInterleaveFactor(VF);
}		}
		unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		unsigned &JTSize) override {
		return Impl.getEstimatedNumberOfCaseClusters(SI, JTSize);
		}
unsigned		unsigned
getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,		getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
OperandValueKind Opd2Info,		OperandValueKind Opd2Info,
OperandValueProperties Opd1PropInfo,		OperandValueProperties Opd1PropInfo,
OperandValueProperties Opd2PropInfo,		OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args) override {		ArrayRef<const Value *> Args) override {
return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,		return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo, Args);		Opd1PropInfo, Opd2PropInfo, Args);
▲ Show 20 Lines • Show All 215 Lines • Show Last 20 Lines

llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 108 Lines • ▼ Show 20 Lines	int getGEPCost(Type PointeeType, const Value Ptr,
// into their uses via addressing modes.		// into their uses via addressing modes.
for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx)		for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx)
if (!isa<Constant>(Operands[Idx]))		if (!isa<Constant>(Operands[Idx]))
return TTI::TCC_Basic;		return TTI::TCC_Basic;

return TTI::TCC_Free;		return TTI::TCC_Free;
}		}

		unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		unsigned &JTSize) {
		JTSize = 0;
		return SI.getNumCases();
		}

unsigned getCallCost(FunctionType *FTy, int NumArgs) {		unsigned getCallCost(FunctionType *FTy, int NumArgs) {
assert(FTy && "FunctionType must be provided to this routine.");		assert(FTy && "FunctionType must be provided to this routine.");

// The target-independent implementation just measures the size of the		// The target-independent implementation just measures the size of the
// function by approximating that each argument will take on average one		// function by approximating that each argument will take on average one
// instruction to prepare.		// instruction to prepare.

if (NumArgs < 0)		if (NumArgs < 0)
▲ Show 20 Lines • Show All 561 Lines • Show Last 20 Lines

llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 165 Lines • ▼ Show 20 Lines	if (IID == Intrinsic::ctlz) {
if (getTLI()->isCheapToSpeculateCtlz())		if (getTLI()->isCheapToSpeculateCtlz())
return TargetTransformInfo::TCC_Basic;		return TargetTransformInfo::TCC_Basic;
return TargetTransformInfo::TCC_Expensive;		return TargetTransformInfo::TCC_Expensive;
}		}

return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);		return BaseT::getIntrinsicCost(IID, RetTy, ParamTys);
}		}

		unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
		unsigned &JumpTableSize) {
		/// Try to find the estimated number of clusters. Note that the number of
		/// clusters identified in this function could be different from the actual
		/// numbers found in lowering. This function ignores switches that are
		/// lowered with a mix of jump table / bit test / BTree. This function was
		/// initially intended to be used when estimating the cost of switch in
		/// inline cost heuristic, but it's a generic cost model to be used in other
		/// places (e.g., in loop unrolling).
		unsigned N = SI.getNumCases();
		const TargetLoweringBase *TLI = getTLI();
		const DataLayout &DL = this->getDataLayout();

		JumpTableSize = 0;
		bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

		// Early exit if both a jump table and bit test are not allowed.
		if (N < 1 \|\| (!IsJTAllowed && DL.getPointerSizeInBits() < N))
		return N;

		APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
		APInt MinCaseVal = MaxCaseVal;
		for (auto CI : SI.cases()) {
		const APInt &CaseVal = CI.getCaseValue()->getValue();
		if (CaseVal.sgt(MaxCaseVal))
		MaxCaseVal = CaseVal;
		if (CaseVal.slt(MinCaseVal))
		MinCaseVal = CaseVal;
		}

		// Check if suitable for a bit test
		if (N <= DL.getPointerSizeInBits()) {
		SmallPtrSet<const BasicBlock *, 4> Dests;
		for (auto I : SI.cases())
		Dests.insert(I.getCaseSuccessor());

		if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
		DL))
		return 1;
		}

		// Check if suitable for a jump table.
		if (IsJTAllowed) {
		if (N < 2 \|\| N < TLI->getMinimumJumpTableEntries())
		return N;
		uint64_t Range =
		(MaxCaseVal - MinCaseVal).getLimitedValue(UINT64_MAX - 1) + 1;
		// Check whether a range of clusters is dense enough for a jump table
		if (TLI->isSuitableForJumpTable(&SI, N, Range)) {
		JumpTableSize = Range;
		return 1;
		}
		}
		return N;
		}

unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }		unsigned getJumpBufAlignment() { return getTLI()->getJumpBufAlignment(); }

unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }		unsigned getJumpBufSize() { return getTLI()->getJumpBufSize(); }

bool shouldBuildLookupTables() {		bool shouldBuildLookupTables() {
const TargetLoweringBase *TLI = getTLI();		const TargetLoweringBase *TLI = getTLI();
return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) \|\|		return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) \|\|
TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);		TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
▲ Show 20 Lines • Show All 939 Lines • Show Last 20 Lines

llvm/trunk/include/llvm/Target/TargetLowering.h

Show First 20 Lines • Show All 769 Lines • ▼ Show 20 Lines	public:

/// Return true if the specified operation is illegal but has a custom lowering		/// Return true if the specified operation is illegal but has a custom lowering
/// on that type. This is used to help guide high-level lowering		/// on that type. This is used to help guide high-level lowering
/// decisions.		/// decisions.
bool isOperationCustom(unsigned Op, EVT VT) const {		bool isOperationCustom(unsigned Op, EVT VT) const {
return (!isTypeLegal(VT) && getOperationAction(Op, VT) == Custom);		return (!isTypeLegal(VT) && getOperationAction(Op, VT) == Custom);
}		}

		/// Return true if lowering to a jump table is allowed.
		bool areJTsAllowed(const Function *Fn) const {
		if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true")
		return false;

		return isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) \|\|
		isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
		}

		/// Check whether the range [Low,High] fits in a machine word.
		bool rangeFitsInWord(const APInt &Low, const APInt &High,
		const DataLayout &DL) const {
		// FIXME: Using the pointer type doesn't seem ideal.
		uint64_t BW = DL.getPointerSizeInBits();
		uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
		return Range <= BW;
		}

		/// Return true if lowering to a jump table is suitable for a set of case
		/// clusters which may contain \p NumCases cases, \p Range range of values.
		/// FIXME: This function checks the maximum table size and density, but the
		/// minimum size is not checked. It would be nice if the minimum size is
		/// also combined within this function. Currently, the minimum size check is
		/// performed in findJumpTables() in SelectionDAGBuilder and
		/// getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
		bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases,
		uint64_t Range) const {
		const bool OptForSize = SI->getParent()->getParent()->optForSize();
		const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize);
		const unsigned MaxJumpTableSize =
		OptForSize \|\| getMaximumJumpTableSize() == 0
		? UINT_MAX
		: getMaximumJumpTableSize();
		// Check whether a range of clusters is dense enough for a jump table.
		if (Range <= MaxJumpTableSize &&
		(NumCases * 100 >= Range * MinDensity)) {
		return true;
		}
		return false;
		}

		/// Return true if lowering to a bit test is suitable for a set of case
		/// clusters which contains \p NumDests unique destinations, \p Low and
		/// \p High as its lowest and highest case values, and expects \p NumCmps
		/// case value comparisons. Check if the number of destinations, comparison
		/// metric, and range are all suitable.
		bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
		const APInt &Low, const APInt &High,
		const DataLayout &DL) const {
		// FIXME: I don't think NumCmps is the correct metric: a single case and a
		// range of cases both require only one branch to lower. Just looking at the
		// number of clusters and destinations should be enough to decide whether to
		// build bit tests.

		// To lower a range with bit tests, the range must fit the bitwidth of a
		// machine word.
		if (!rangeFitsInWord(Low, High, DL))
		return false;

		// Decide whether it's profitable to lower this range with bit tests. Each
		// destination requires a bit test and branch, and there is an overall range
		// check branch. For a small number of clusters, separate comparisons might
		// be cheaper, and for many destinations, splitting the range might be
		// better.
		return (NumDests == 1 && NumCmps >= 3) \|\| (NumDests == 2 && NumCmps >= 5) \|\|
		(NumDests == 3 && NumCmps >= 6);
		}

/// Return true if the specified operation is illegal on this target or		/// Return true if the specified operation is illegal on this target or
/// unlikely to be made legal with custom lowering. This is used to help guide		/// unlikely to be made legal with custom lowering. This is used to help guide
/// high-level lowering decisions.		/// high-level lowering decisions.
bool isOperationExpand(unsigned Op, EVT VT) const {		bool isOperationExpand(unsigned Op, EVT VT) const {
return (!isTypeLegal(VT) \|\| getOperationAction(Op, VT) == Expand);		return (!isTypeLegal(VT) \|\| getOperationAction(Op, VT) == Expand);
}		}

/// Return true if the specified operation is legal on this target.		/// Return true if the specified operation is legal on this target.
▲ Show 20 Lines • Show All 358 Lines • ▼ Show 20 Lines	public:
/// Determine if we should use _longjmp or longjmp to implement llvm.longjmp.		/// Determine if we should use _longjmp or longjmp to implement llvm.longjmp.
bool usesUnderscoreLongJmp() const {		bool usesUnderscoreLongJmp() const {
return UseUnderscoreLongJmp;		return UseUnderscoreLongJmp;
}		}

/// Return lower limit for number of blocks in a jump table.		/// Return lower limit for number of blocks in a jump table.
unsigned getMinimumJumpTableEntries() const;		unsigned getMinimumJumpTableEntries() const;

		/// Return lower limit of the density in a jump table.
		unsigned getMinimumJumpTableDensity(bool OptForSize) const;

/// Return upper limit for number of entries in a jump table.		/// Return upper limit for number of entries in a jump table.
/// Zero if no limit.		/// Zero if no limit.
unsigned getMaximumJumpTableSize() const;		unsigned getMaximumJumpTableSize() const;

virtual bool isJumpTableRelative() const {		virtual bool isJumpTableRelative() const {
return TM.isPositionIndependent();		return TM.isPositionIndependent();
}		}

▲ Show 20 Lines • Show All 2,111 Lines • Show Last 20 Lines

llvm/trunk/lib/Analysis/InlineCost.cpp

Show First 20 Lines • Show All 48 Lines • ▼ Show 20 Lines	static cl::opt<int> HintThreshold(
"inlinehint-threshold", cl::Hidden, cl::init(325),		"inlinehint-threshold", cl::Hidden, cl::init(325),
cl::desc("Threshold for inlining functions with inline hint"));		cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>		static cl::opt<int>
ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,		ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
cl::init(45),		cl::init(45),
cl::desc("Threshold for inlining cold callsites"));		cl::desc("Threshold for inlining cold callsites"));

		static cl::opt<bool>
		EnableGenericSwitchCost("inline-generic-switch-cost", cl::Hidden,
		cl::init(false),
		cl::desc("Enable generic switch cost model"));

// We introduce this threshold to help performance of instrumentation based		// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and		// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.		// BFI.
static cl::opt<int> ColdThreshold(		static cl::opt<int> ColdThreshold(
"inlinecold-threshold", cl::Hidden, cl::init(225),		"inlinecold-threshold", cl::Hidden, cl::init(225),
cl::desc("Threshold for inlining functions with cold attribute"));		cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>		static cl::opt<int>
▲ Show 20 Lines • Show All 928 Lines • ▼ Show 20 Lines	bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
// We model unconditional switches as free, see the comments on handling		// We model unconditional switches as free, see the comments on handling
// branches.		// branches.
if (isa<ConstantInt>(SI.getCondition()))		if (isa<ConstantInt>(SI.getCondition()))
return true;		return true;
if (Value *V = SimplifiedValues.lookup(SI.getCondition()))		if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
if (isa<ConstantInt>(V))		if (isa<ConstantInt>(V))
return true;		return true;

// Otherwise, we need to accumulate a cost proportional to the number of		if (EnableGenericSwitchCost) {
// distinct successor blocks. This fan-out in the CFG cannot be represented		// Assume the most general case where the swith is lowered into
// for free even if we can represent the core switch as a jumptable that		// either a jump table, bit test, or a balanced binary tree consisting of
// takes a single instruction.		// case clusters without merging adjacent clusters with the same
//		// destination. We do not consider the switches that are lowered with a mix
		// of jump table/bit test/binary search tree. The cost of the switch is
		// proportional to the size of the tree or the size of jump table range.

		// Exit early for a large switch, assuming one case needs at least one
		// instruction.
		// FIXME: This is not true for a bit test, but ignore such case for now to
		// save compile-time.
		int64_t CostLowerBound =
		std::min((int64_t)INT_MAX,
		(int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

		if (CostLowerBound > Threshold) {
		Cost = CostLowerBound;
		return false;
		}

		unsigned JumpTableSize = 0;
		unsigned NumCaseCluster =
		TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

		// If suitable for a jump table, consider the cost for the table size and
		// branch to destination.
		if (JumpTableSize) {
		int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
		4 * InlineConstants::InstrCost;
		Cost = std::min((int64_t)INT_MAX, JTCost + Cost);
		return false;
		}

		// Considering forming a binary search, we should find the number of nodes
		// which is same as the number of comparisons when lowered. For a given
		// number of clusters, n, we can define a recursive function, f(n), to find
		// the number of nodes in the tree. The recursion is :
		// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
		// and f(n) = n, when n <= 3.
		// This will lead a binary tree where the leaf should be either f(2) or f(3)
		// when n > 3. So, the number of comparisons from leaves should be n, while
		// the number of non-leaf should be :
		// 2^(log2(n) - 1) - 1
		// = 2^log2(n) * 2^-1 - 1
		// = n / 2 - 1.
		// Considering comparisons from leaf and non-leaf nodes, we can estimate the
		// number of comparisons in a simple closed form :
		// n + n / 2 - 1 = n * 3 / 2 - 1
		if (NumCaseCluster <= 3) {
		// Suppose a comparison includes one compare and one conditional branch.
		Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
		return false;
		}
		int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1;
		uint64_t SwitchCost =
		ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
		Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost);
		return false;
		}

		// Use a simple switch cost model where we accumulate a cost proportional to
		// the number of distinct successor blocks. This fan-out in the CFG cannot
		// be represented for free even if we can represent the core switch as a
		// jumptable that takes a single instruction.
		///
// NB: We convert large switches which are just used to initialize large phi		// NB: We convert large switches which are just used to initialize large phi
// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent		// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
// inlining those. It will prevent inlining in cases where the optimization		// inlining those. It will prevent inlining in cases where the optimization
// does not (yet) fire.		// does not (yet) fire.
SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;		SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
SuccessorBlocks.insert(SI.getDefaultDest());		SuccessorBlocks.insert(SI.getDefaultDest());
for (auto Case : SI.cases())		for (auto Case : SI.cases())
SuccessorBlocks.insert(Case.getCaseSuccessor());		SuccessorBlocks.insert(Case.getCaseSuccessor());
▲ Show 20 Lines • Show All 583 Lines • Show Last 20 Lines

llvm/trunk/lib/Analysis/TargetTransformInfo.cpp

	Show First 20 Lines • Show All 77 Lines • ▼ Show 20 Lines

	int TargetTransformInfo::getIntrinsicCost(			int TargetTransformInfo::getIntrinsicCost(
	Intrinsic::ID IID, Type RetTy, ArrayRef<const Value > Arguments) const {			Intrinsic::ID IID, Type RetTy, ArrayRef<const Value > Arguments) const {
	int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);			int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
	assert(Cost >= 0 && "TTI should not produce negative costs!");			assert(Cost >= 0 && "TTI should not produce negative costs!");
	return Cost;			return Cost;
	}			}

				unsigned
				TargetTransformInfo::getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
				unsigned &JTSize) const {
				return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize);
				}

	int TargetTransformInfo::getUserCost(const User *U) const {			int TargetTransformInfo::getUserCost(const User *U) const {
	int Cost = TTIImpl->getUserCost(U);			int Cost = TTIImpl->getUserCost(U);
	assert(Cost >= 0 && "TTI should not produce negative costs!");			assert(Cost >= 0 && "TTI should not produce negative costs!");
	return Cost;			return Cost;
	}			}

	bool TargetTransformInfo::hasBranchDivergence() const {			bool TargetTransformInfo::hasBranchDivergence() const {
	return TTIImpl->hasBranchDivergence();			return TTIImpl->hasBranchDivergence();
	▲ Show 20 Lines • Show All 452 Lines • Show Last 20 Lines

llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h

Show First 20 Lines • Show All 298 Lines • ▼ Show 20 Lines	struct BitTestBlock {
bool ContiguousRange;		bool ContiguousRange;
MachineBasicBlock *Parent;		MachineBasicBlock *Parent;
MachineBasicBlock *Default;		MachineBasicBlock *Default;
BitTestInfo Cases;		BitTestInfo Cases;
BranchProbability Prob;		BranchProbability Prob;
BranchProbability DefaultProb;		BranchProbability DefaultProb;
};		};

/// Check whether a range of clusters is dense enough for a jump table.		/// Return the range of value in [First..Last].
bool isDense(const CaseClusterVector &Clusters,		uint64_t getJumpTableRange(const CaseClusterVector &Clusters, unsigned First,
const SmallVectorImpl<unsigned> &TotalCases,		unsigned Last) const;
unsigned First, unsigned Last, unsigned MinDensity) const;
		/// Return the number of cases in [First..Last].
		uint64_t getJumpTableNumCases(const SmallVectorImpl<unsigned> &TotalCases,
		unsigned First, unsigned Last) const;

/// Build a jump table cluster from Clusters[First..Last]. Returns false if it		/// Build a jump table cluster from Clusters[First..Last]. Returns false if it
/// decides it's not a good idea.		/// decides it's not a good idea.
bool buildJumpTable(const CaseClusterVector &Clusters, unsigned First,		bool buildJumpTable(const CaseClusterVector &Clusters, unsigned First,
unsigned Last, const SwitchInst *SI,		unsigned Last, const SwitchInst *SI,
MachineBasicBlock *DefaultMBB, CaseCluster &JTCluster);		MachineBasicBlock *DefaultMBB, CaseCluster &JTCluster);

/// Find clusters of cases suitable for jump table lowering.		/// Find clusters of cases suitable for jump table lowering.
void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI,		void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI,
MachineBasicBlock *DefaultMBB);		MachineBasicBlock *DefaultMBB);

/// Check whether the range [Low,High] fits in a machine word.
bool rangeFitsInWord(const APInt &Low, const APInt &High);

/// Check whether these clusters are suitable for lowering with bit tests based
/// on the number of destinations, comparison metric, and range.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
const APInt &Low, const APInt &High);

/// Build a bit test cluster from Clusters[First..Last]. Returns false if it		/// Build a bit test cluster from Clusters[First..Last]. Returns false if it
/// decides it's not a good idea.		/// decides it's not a good idea.
bool buildBitTests(CaseClusterVector &Clusters, unsigned First, unsigned Last,		bool buildBitTests(CaseClusterVector &Clusters, unsigned First, unsigned Last,
const SwitchInst *SI, CaseCluster &BTCluster);		const SwitchInst *SI, CaseCluster &BTCluster);

/// Find clusters of cases suitable for bit test lowering.		/// Find clusters of cases suitable for bit test lowering.
void findBitTestClusters(CaseClusterVector &Clusters, const SwitchInst *SI);		void findBitTestClusters(CaseClusterVector &Clusters, const SwitchInst *SI);

▲ Show 20 Lines • Show All 685 Lines • Show Last 20 Lines

llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 77 Lines • ▼ Show 20 Lines
static unsigned LimitFloatPrecision;		static unsigned LimitFloatPrecision;

static cl::opt<unsigned, true>		static cl::opt<unsigned, true>
LimitFPPrecision("limit-float-precision",		LimitFPPrecision("limit-float-precision",
cl::desc("Generate low-precision inline sequences "		cl::desc("Generate low-precision inline sequences "
"for some float libcalls"),		"for some float libcalls"),
cl::location(LimitFloatPrecision),		cl::location(LimitFloatPrecision),
cl::init(0));		cl::init(0));

/// Minimum jump table density for normal functions.
static cl::opt<unsigned>
JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
cl::desc("Minimum density for building a jump table in "
"a normal function"));

/// Minimum jump table density for -Os or -Oz functions.
static cl::opt<unsigned>
OptsizeJumpTableDensity("optsize-jump-table-density", cl::init(40), cl::Hidden,
cl::desc("Minimum density for building a jump table in "
"an optsize function"));


// Limit the width of DAG chains. This is important in general to prevent		// Limit the width of DAG chains. This is important in general to prevent
// DAG-based analysis from blowing up. For example, alias analysis and		// DAG-based analysis from blowing up. For example, alias analysis and
// load clustering may not complete in reasonable time. It is difficult to		// load clustering may not complete in reasonable time. It is difficult to
// recognize and avoid this situation within each individual analysis, and		// recognize and avoid this situation within each individual analysis, and
// future analyses are likely to have the same behavior. Limiting DAG width is		// future analyses are likely to have the same behavior. Limiting DAG width is
// the safe approach and will be especially important with global DAGs.		// the safe approach and will be especially important with global DAGs.
//		//
// MaxParallelChains default is arbitrarily high to avoid affecting		// MaxParallelChains default is arbitrarily high to avoid affecting
▲ Show 20 Lines • Show All 8,476 Lines • ▼ Show 20 Lines
void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) {		void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) {
// If the node is null, we do have a tail call.		// If the node is null, we do have a tail call.
if (MaybeTC.getNode() != nullptr)		if (MaybeTC.getNode() != nullptr)
DAG.setRoot(MaybeTC);		DAG.setRoot(MaybeTC);
else		else
HasTailCall = true;		HasTailCall = true;
}		}

bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters,		uint64_t
const SmallVectorImpl<unsigned> &TotalCases,		SelectionDAGBuilder::getJumpTableRange(const CaseClusterVector &Clusters,
unsigned First, unsigned Last,		unsigned First, unsigned Last) const {
unsigned Density) const {
assert(Last >= First);		assert(Last >= First);
assert(TotalCases[Last] >= TotalCases[First]);

const APInt &LowCase = Clusters[First].Low->getValue();		const APInt &LowCase = Clusters[First].Low->getValue();
const APInt &HighCase = Clusters[Last].High->getValue();		const APInt &HighCase = Clusters[Last].High->getValue();
assert(LowCase.getBitWidth() == HighCase.getBitWidth());		assert(LowCase.getBitWidth() == HighCase.getBitWidth());

// FIXME: A range of consecutive cases has 100% density, but only requires one		// FIXME: A range of consecutive cases has 100% density, but only requires one
// comparison to lower. We should discriminate against such consecutive ranges		// comparison to lower. We should discriminate against such consecutive ranges
// in jump tables.		// in jump tables.

uint64_t Diff = (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100);		return (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100) + 1;
uint64_t Range = Diff + 1;		}

		uint64_t SelectionDAGBuilder::getJumpTableNumCases(
		const SmallVectorImpl<unsigned> &TotalCases, unsigned First,
		unsigned Last) const {
		assert(Last >= First);
		assert(TotalCases[Last] >= TotalCases[First]);
uint64_t NumCases =		uint64_t NumCases =
TotalCases[Last] - (First == 0 ? 0 : TotalCases[First - 1]);		TotalCases[Last] - (First == 0 ? 0 : TotalCases[First - 1]);
		return NumCases;
assert(NumCases < UINT64_MAX / 100);
assert(Range >= NumCases);

return NumCases * 100 >= Range * Density;
}

static inline bool areJTsAllowed(const TargetLowering &TLI,
const SwitchInst *SI) {
const Function *Fn = SI->getParent()->getParent();
if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true")
return false;

return TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) \|\|
TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
}		}

bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters,		bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters,
unsigned First, unsigned Last,		unsigned First, unsigned Last,
const SwitchInst *SI,		const SwitchInst *SI,
MachineBasicBlock *DefaultMBB,		MachineBasicBlock *DefaultMBB,
CaseCluster &JTCluster) {		CaseCluster &JTCluster) {
assert(First <= Last);		assert(First <= Last);
Show All 22 Lines	if (I != First) {
Table.push_back(DefaultMBB);		Table.push_back(DefaultMBB);
}		}
uint64_t ClusterSize = (High - Low).getLimitedValue() + 1;		uint64_t ClusterSize = (High - Low).getLimitedValue() + 1;
for (uint64_t J = 0; J < ClusterSize; ++J)		for (uint64_t J = 0; J < ClusterSize; ++J)
Table.push_back(Clusters[I].MBB);		Table.push_back(Clusters[I].MBB);
JTProbs[Clusters[I].MBB] += Clusters[I].Prob;		JTProbs[Clusters[I].MBB] += Clusters[I].Prob;
}		}

		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumDests = JTProbs.size();		unsigned NumDests = JTProbs.size();
if (isSuitableForBitTests(NumDests, NumCmps,		if (TLI.isSuitableForBitTests(
Clusters[First].Low->getValue(),		NumDests, NumCmps, Clusters[First].Low->getValue(),
Clusters[Last].High->getValue())) {		Clusters[Last].High->getValue(), DAG.getDataLayout())) {
// Clusters[First..Last] should be lowered as bit tests instead.		// Clusters[First..Last] should be lowered as bit tests instead.
return false;		return false;
}		}

// Create the MBB that will load from and jump through the table.		// Create the MBB that will load from and jump through the table.
// Note: We create it here, but it's not inserted into the function yet.		// Note: We create it here, but it's not inserted into the function yet.
MachineFunction *CurMF = FuncInfo.MF;		MachineFunction *CurMF = FuncInfo.MF;
MachineBasicBlock *JumpTableMBB =		MachineBasicBlock *JumpTableMBB =
CurMF->CreateMachineBasicBlock(SI->getParent());		CurMF->CreateMachineBasicBlock(SI->getParent());

// Add successors. Note: use table order for determinism.		// Add successors. Note: use table order for determinism.
SmallPtrSet<MachineBasicBlock *, 8> Done;		SmallPtrSet<MachineBasicBlock *, 8> Done;
for (MachineBasicBlock *Succ : Table) {		for (MachineBasicBlock *Succ : Table) {
if (Done.count(Succ))		if (Done.count(Succ))
continue;		continue;
addSuccessorWithProb(JumpTableMBB, Succ, JTProbs[Succ]);		addSuccessorWithProb(JumpTableMBB, Succ, JTProbs[Succ]);
Done.insert(Succ);		Done.insert(Succ);
}		}
JumpTableMBB->normalizeSuccProbs();		JumpTableMBB->normalizeSuccProbs();

const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding())		unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding())
->createJumpTableIndex(Table);		->createJumpTableIndex(Table);

// Set up the jump table info.		// Set up the jump table info.
JumpTable JT(-1U, JTI, JumpTableMBB, nullptr);		JumpTable JT(-1U, JTI, JumpTableMBB, nullptr);
JumpTableHeader JTH(Clusters[First].Low->getValue(),		JumpTableHeader JTH(Clusters[First].Low->getValue(),
Clusters[Last].High->getValue(), SI->getCondition(),		Clusters[Last].High->getValue(), SI->getCondition(),
nullptr, false);		nullptr, false);
Show All 12 Lines	#ifndef NDEBUG
assert(!Clusters.empty());		assert(!Clusters.empty());
for (CaseCluster &C : Clusters)		for (CaseCluster &C : Clusters)
assert(C.Kind == CC_Range);		assert(C.Kind == CC_Range);
for (unsigned i = 1, e = Clusters.size(); i < e; ++i)		for (unsigned i = 1, e = Clusters.size(); i < e; ++i)
assert(Clusters[i - 1].High->getValue().slt(Clusters[i].Low->getValue()));		assert(Clusters[i - 1].High->getValue().slt(Clusters[i].Low->getValue()));
#endif		#endif

const TargetLowering &TLI = DAG.getTargetLoweringInfo();		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!areJTsAllowed(TLI, SI))		if (!TLI.areJTsAllowed(SI->getParent()->getParent()))
return;		return;

const bool OptForSize = DefaultMBB->getParent()->getFunction()->optForSize();

const int64_t N = Clusters.size();		const int64_t N = Clusters.size();
const unsigned MinJumpTableEntries = TLI.getMinimumJumpTableEntries();		const unsigned MinJumpTableEntries = TLI.getMinimumJumpTableEntries();
const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2;		const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2;
const unsigned MaxJumpTableSize =
OptForSize \|\| TLI.getMaximumJumpTableSize() == 0
? UINT_MAX : TLI.getMaximumJumpTableSize();

if (N < 2 \|\| N < MinJumpTableEntries)		if (N < 2 \|\| N < MinJumpTableEntries)
return;		return;

// TotalCases[i]: Total nbr of cases in Clusters[0..i].		// TotalCases[i]: Total nbr of cases in Clusters[0..i].
SmallVector<unsigned, 8> TotalCases(N);		SmallVector<unsigned, 8> TotalCases(N);
for (unsigned i = 0; i < N; ++i) {		for (unsigned i = 0; i < N; ++i) {
const APInt &Hi = Clusters[i].High->getValue();		const APInt &Hi = Clusters[i].High->getValue();
const APInt &Lo = Clusters[i].Low->getValue();		const APInt &Lo = Clusters[i].Low->getValue();
TotalCases[i] = (Hi - Lo).getLimitedValue() + 1;		TotalCases[i] = (Hi - Lo).getLimitedValue() + 1;
if (i != 0)		if (i != 0)
TotalCases[i] += TotalCases[i - 1];		TotalCases[i] += TotalCases[i - 1];
}		}

const unsigned MinDensity =
OptForSize ? OptsizeJumpTableDensity : JumpTableDensity;

// Cheap case: the whole range may be suitable for jump table.		// Cheap case: the whole range may be suitable for jump table.
unsigned JumpTableSize = (Clusters[N - 1].High->getValue() -		uint64_t Range = getJumpTableRange(Clusters,0, N - 1);
Clusters[0].Low->getValue())		uint64_t NumCases = getJumpTableNumCases(TotalCases, 0, N - 1);
.getLimitedValue(UINT_MAX - 1) + 1;		assert(NumCases < UINT64_MAX / 100);
if (JumpTableSize <= MaxJumpTableSize &&		assert(Range >= NumCases);
isDense(Clusters, TotalCases, 0, N - 1, MinDensity)) {		if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) {
CaseCluster JTCluster;		CaseCluster JTCluster;
if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) {		if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) {
Clusters[0] = JTCluster;		Clusters[0] = JTCluster;
Clusters.resize(1);		Clusters.resize(1);
return;		return;
}		}
}		}

Show All 36 Lines	for (int64_t i = N - 2; i >= 0; i--) {
// Baseline: Put Clusters[i] into a partition on its own.		// Baseline: Put Clusters[i] into a partition on its own.
MinPartitions[i] = MinPartitions[i + 1] + 1;		MinPartitions[i] = MinPartitions[i + 1] + 1;
LastElement[i] = i;		LastElement[i] = i;
PartitionsScore[i] = PartitionsScore[i + 1] + PartitionScores::SingleCase;		PartitionsScore[i] = PartitionsScore[i + 1] + PartitionScores::SingleCase;

// Search for a solution that results in fewer partitions.		// Search for a solution that results in fewer partitions.
for (int64_t j = N - 1; j > i; j--) {		for (int64_t j = N - 1; j > i; j--) {
// Try building a partition from Clusters[i..j].		// Try building a partition from Clusters[i..j].
JumpTableSize = (Clusters[j].High->getValue() -		uint64_t Range = getJumpTableRange(Clusters, i, j);
Clusters[i].Low->getValue())		uint64_t NumCases = getJumpTableNumCases(TotalCases, i, j);
.getLimitedValue(UINT_MAX - 1) + 1;		assert(NumCases < UINT64_MAX / 100);
if (JumpTableSize <= MaxJumpTableSize &&		assert(Range >= NumCases);
isDense(Clusters, TotalCases, i, j, MinDensity)) {		if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) {
unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);		unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1];		unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1];
int64_t NumEntries = j - i + 1;		int64_t NumEntries = j - i + 1;

if (NumEntries == 1)		if (NumEntries == 1)
Score += PartitionScores::SingleCase;		Score += PartitionScores::SingleCase;
else if (NumEntries <= SmallNumberOfEntries)		else if (NumEntries <= SmallNumberOfEntries)
Score += PartitionScores::FewCases;		Score += PartitionScores::FewCases;
Show All 27 Lines	for (unsigned First = 0, Last; First < N; First = Last + 1) {
} else {		} else {
for (unsigned I = First; I <= Last; ++I)		for (unsigned I = First; I <= Last; ++I)
std::memmove(&Clusters[DstIndex++], &Clusters[I], sizeof(Clusters[I]));		std::memmove(&Clusters[DstIndex++], &Clusters[I], sizeof(Clusters[I]));
}		}
}		}
Clusters.resize(DstIndex);		Clusters.resize(DstIndex);
}		}

/// Return true if the number of values in the inclusive range [Low, High]
/// is no larger than the pointer size in bits reported by the DataLayout.
///
/// \param Low  lower bound of the case range.
/// \param High upper bound of the case range.
bool SelectionDAGBuilder::rangeFitsInWord(const APInt &Low, const APInt &High) {
// FIXME: Using the pointer type doesn't seem ideal.
uint64_t BW = DAG.getDataLayout().getPointerSizeInBits();
// Clamp the difference to UINT64_MAX - 1 so that adding 1 cannot wrap to 0.
uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
return Range <= BW;
}

/// Return true if the case range [Low, High] should be lowered as bit tests.
///
/// The range must first pass rangeFitsInWord(); after that the decision is a
/// fixed table keyed on the number of destinations and comparisons.
///
/// \param NumDests number of distinct destination blocks in the range.
/// \param NumCmps  number of comparisons counted by the caller for the range.
/// \param Low      lower bound of the case range.
/// \param High     upper bound of the case range.
bool SelectionDAGBuilder::isSuitableForBitTests(unsigned NumDests,
unsigned NumCmps,
const APInt &Low,
const APInt &High) {
// FIXME: I don't think NumCmps is the correct metric: a single case and a
// range of cases both require only one branch to lower. Just looking at the
// number of clusters and destinations should be enough to decide whether to
// build bit tests.

// To lower a range with bit tests, the range must fit the bitwidth of a
// machine word.
if (!rangeFitsInWord(Low, High))
return false;

// Decide whether it's profitable to lower this range with bit tests. Each
// destination requires a bit test and branch, and there is an overall range
// check branch. For a small number of clusters, separate comparisons might be
// cheaper, and for many destinations, splitting the range might be better.
// More than three destinations (or zero) is never considered suitable.
return (NumDests == 1 && NumCmps >= 3) \|\|
(NumDests == 2 && NumCmps >= 5) \|\|
(NumDests == 3 && NumCmps >= 6);
}

bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,		bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,
unsigned First, unsigned Last,		unsigned First, unsigned Last,
const SwitchInst *SI,		const SwitchInst *SI,
CaseCluster &BTCluster) {		CaseCluster &BTCluster) {
assert(First <= Last);		assert(First <= Last);
if (First == Last)		if (First == Last)
return false;		return false;

BitVector Dests(FuncInfo.MF->getNumBlockIDs());		BitVector Dests(FuncInfo.MF->getNumBlockIDs());
unsigned NumCmps = 0;		unsigned NumCmps = 0;
for (int64_t I = First; I <= Last; ++I) {		for (int64_t I = First; I <= Last; ++I) {
assert(Clusters[I].Kind == CC_Range);		assert(Clusters[I].Kind == CC_Range);
Dests.set(Clusters[I].MBB->getNumber());		Dests.set(Clusters[I].MBB->getNumber());
NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2;		NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
}		}
unsigned NumDests = Dests.count();		unsigned NumDests = Dests.count();

APInt Low = Clusters[First].Low->getValue();		APInt Low = Clusters[First].Low->getValue();
APInt High = Clusters[Last].High->getValue();		APInt High = Clusters[Last].High->getValue();
assert(Low.slt(High));		assert(Low.slt(High));

if (!isSuitableForBitTests(NumDests, NumCmps, Low, High))		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
		const DataLayout &DL = DAG.getDataLayout();
		if (!TLI.isSuitableForBitTests(NumDests, NumCmps, Low, High, DL))
return false;		return false;

APInt LowBound;		APInt LowBound;
APInt CmpRange;		APInt CmpRange;

const int BitWidth = DAG.getTargetLoweringInfo()		const int BitWidth = TLI.getPointerTy(DL).getSizeInBits();
.getPointerTy(DAG.getDataLayout())		assert(TLI.rangeFitsInWord(Low, High, DL) &&
.getSizeInBits();		"Case range must fit in bit mask!");
assert(rangeFitsInWord(Low, High) && "Case range must fit in bit mask!");

// Check if the clusters cover a contiguous range such that no value in the		// Check if the clusters cover a contiguous range such that no value in the
// range will jump to the default statement.		// range will jump to the default statement.
bool ContiguousRange = true;		bool ContiguousRange = true;
for (int64_t I = First + 1; I <= Last; ++I) {		for (int64_t I = First + 1; I <= Last; ++I) {
if (Clusters[I].Low->getValue() != Clusters[I - 1].High->getValue() + 1) {		if (Clusters[I].Low->getValue() != Clusters[I - 1].High->getValue() + 1) {
ContiguousRange = false;		ContiguousRange = false;
break;		break;
▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines
#endif		#endif

// The algorithm below is not suitable for -O0.		// The algorithm below is not suitable for -O0.
if (TM.getOptLevel() == CodeGenOpt::None)		if (TM.getOptLevel() == CodeGenOpt::None)
return;		return;

// If target does not have legal shift left, do not emit bit tests at all.		// If target does not have legal shift left, do not emit bit tests at all.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT PTy = TLI.getPointerTy(DAG.getDataLayout());		const DataLayout &DL = DAG.getDataLayout();

		EVT PTy = TLI.getPointerTy(DL);
if (!TLI.isOperationLegal(ISD::SHL, PTy))		if (!TLI.isOperationLegal(ISD::SHL, PTy))
return;		return;

int BitWidth = PTy.getSizeInBits();		int BitWidth = PTy.getSizeInBits();
const int64_t N = Clusters.size();		const int64_t N = Clusters.size();

// MinPartitions[i] is the minimum nbr of partitions of Clusters[i..N-1].		// MinPartitions[i] is the minimum nbr of partitions of Clusters[i..N-1].
SmallVector<unsigned, 8> MinPartitions(N);		SmallVector<unsigned, 8> MinPartitions(N);
Show All 14 Lines	for (int64_t i = N - 2; i >= 0; --i) {
LastElement[i] = i;		LastElement[i] = i;

// Search for a solution that results in fewer partitions.		// Search for a solution that results in fewer partitions.
// Note: the search is limited by BitWidth, reducing time complexity.		// Note: the search is limited by BitWidth, reducing time complexity.
for (int64_t j = std::min(N - 1, i + BitWidth - 1); j > i; --j) {		for (int64_t j = std::min(N - 1, i + BitWidth - 1); j > i; --j) {
// Try building a partition from Clusters[i..j].		// Try building a partition from Clusters[i..j].

// Check the range.		// Check the range.
if (!rangeFitsInWord(Clusters[i].Low->getValue(),		if (!TLI.rangeFitsInWord(Clusters[i].Low->getValue(),
Clusters[j].High->getValue()))		Clusters[j].High->getValue(), DL))
continue;		continue;

// Check nbr of destinations and cluster types.		// Check nbr of destinations and cluster types.
// FIXME: This works, but doesn't seem very efficient.		// FIXME: This works, but doesn't seem very efficient.
bool RangesOnly = true;		bool RangesOnly = true;
BitVector Dests(FuncInfo.MF->getNumBlockIDs());		BitVector Dests(FuncInfo.MF->getNumBlockIDs());
for (int64_t k = i; k <= j; k++) {		for (int64_t k = i; k <= j; k++) {
if (Clusters[k].Kind != CC_Range) {		if (Clusters[k].Kind != CC_Range) {
▲ Show 20 Lines • Show All 501 Lines • Show Last 20 Lines

llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp

	Show First 20 Lines • Show All 47 Lines • ▼ Show 20 Lines
	static cl::opt<unsigned> MinimumJumpTableEntries			static cl::opt<unsigned> MinimumJumpTableEntries
	("min-jump-table-entries", cl::init(4), cl::Hidden,			("min-jump-table-entries", cl::init(4), cl::Hidden,
	cl::desc("Set minimum number of entries to use a jump table."));			cl::desc("Set minimum number of entries to use a jump table."));

	static cl::opt<unsigned> MaximumJumpTableSize			static cl::opt<unsigned> MaximumJumpTableSize
	("max-jump-table-size", cl::init(0), cl::Hidden,			("max-jump-table-size", cl::init(0), cl::Hidden,
	cl::desc("Set maximum size of jump tables; zero for no limit."));			cl::desc("Set maximum size of jump tables; zero for no limit."));

				/// Minimum jump table density for normal functions.
				static cl::opt<unsigned>
				JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
				cl::desc("Minimum density for building a jump table in "
				"a normal function"));

				/// Minimum jump table density for -Os or -Oz functions.
				static cl::opt<unsigned> OptsizeJumpTableDensity(
				"optsize-jump-table-density", cl::init(40), cl::Hidden,
				cl::desc("Minimum density for building a jump table in "
				"an optsize function"));

	// Although this default value is arbitrary, it is not random. It is assumed			// Although this default value is arbitrary, it is not random. It is assumed
	// that a condition that evaluates the same way by a higher percentage than this			// that a condition that evaluates the same way by a higher percentage than this
	// is best represented as control flow. Therefore, the default value N should be			// is best represented as control flow. Therefore, the default value N should be
	// set such that the win from N% correct executions is greater than the loss			// set such that the win from N% correct executions is greater than the loss
	// from (100 - N)% mispredicted executions for the majority of intended targets.			// from (100 - N)% mispredicted executions for the majority of intended targets.
	static cl::opt<int> MinPercentageForPredictableBranch(			static cl::opt<int> MinPercentageForPredictableBranch(
	"min-predictable-branch", cl::init(99),			"min-predictable-branch", cl::init(99),
	cl::desc("Minimum percentage (0-100) that a condition must be either true "			cl::desc("Minimum percentage (0-100) that a condition must be either true "
	▲ Show 20 Lines • Show All 1,832 Lines • ▼ Show 20 Lines
	unsigned TargetLoweringBase::getMinimumJumpTableEntries() const {			unsigned TargetLoweringBase::getMinimumJumpTableEntries() const {
	return MinimumJumpTableEntries;			return MinimumJumpTableEntries;
	}			}

	void TargetLoweringBase::setMinimumJumpTableEntries(unsigned Val) {			void TargetLoweringBase::setMinimumJumpTableEntries(unsigned Val) {
	MinimumJumpTableEntries = Val;			MinimumJumpTableEntries = Val;
	}			}

				/// Return the minimum jump table density: the value of the
				/// OptsizeJumpTableDensity option when \p OptForSize is true, otherwise the
				/// value of the JumpTableDensity option.
				unsigned TargetLoweringBase::getMinimumJumpTableDensity(bool OptForSize) const {
				return OptForSize ? OptsizeJumpTableDensity : JumpTableDensity;
				}

	unsigned TargetLoweringBase::getMaximumJumpTableSize() const {			unsigned TargetLoweringBase::getMaximumJumpTableSize() const {
	return MaximumJumpTableSize;			return MaximumJumpTableSize;
	}			}

	void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) {			void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) {
	MaximumJumpTableSize = Val;			MaximumJumpTableSize = Val;
	}			}

	▲ Show 20 Lines • Show All 183 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/Inline/AArch64/switch.ll

				; RUN: opt < %s -inline -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true \| FileCheck %s
				; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true \| FileCheck %s

				; Callee with a 10-case switch on %a: case 0 goes to %sw.bb0 and cases
				; 1000, 2000, ..., 9000 all go to %sw.bb1; any other value goes to
				; %sw.default. Every path does a volatile store of %a to %P and the
				; function always returns 42.
				define i32 @callee_range(i32 %a, i32* %P) {
				switch i32 %a, label %sw.default [
				i32 0, label %sw.bb0
				i32 1000, label %sw.bb1
				i32 2000, label %sw.bb1
				i32 3000, label %sw.bb1
				i32 4000, label %sw.bb1
				i32 5000, label %sw.bb1
				i32 6000, label %sw.bb1
				i32 7000, label %sw.bb1
				i32 8000, label %sw.bb1
				i32 9000, label %sw.bb1
				]

				sw.default:
				store volatile i32 %a, i32* %P
				br label %return
				sw.bb0:
				store volatile i32 %a, i32* %P
				br label %return
				sw.bb1:
				store volatile i32 %a, i32* %P
				br label %return
				return:
				ret i32 42
				}

				; The CHECK line requires a call to @callee_range to still be present in the
				; output, i.e. the callee is expected NOT to be inlined.
				define i32 @caller_range(i32 %a, i32* %P) {
				; CHECK-LABEL: @caller_range(
				; CHECK: call i32 @callee_range
				%r = call i32 @callee_range(i32 %a, i32* %P)
				ret i32 %r
				}

				; Callee with a 9-case switch on %a over the contiguous values 0..8, cycling
				; through three destinations (%sw.bb0, %sw.bb1, %sw.bb2). %sw.default,
				; %sw.bb0 and %sw.bb1 do a volatile store of %a to %P; %sw.bb2 only branches
				; to %return. The function always returns 42.
				; NOTE(review): the name suggests this models a switch that would be
				; lowered as bit tests -- confirm against the inline cost logic.
				define i32 @callee_bittest(i32 %a, i32* %P) {
				switch i32 %a, label %sw.default [
				i32 0, label %sw.bb0
				i32 1, label %sw.bb1
				i32 2, label %sw.bb2
				i32 3, label %sw.bb0
				i32 4, label %sw.bb1
				i32 5, label %sw.bb2
				i32 6, label %sw.bb0
				i32 7, label %sw.bb1
				i32 8, label %sw.bb2
				]

				sw.default:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb0:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb1:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb2:
				br label %return

				return:
				ret i32 42
				}


				; The CHECK-NOT line requires that no call to @callee_bittest remains after
				; the CHECK-LABEL match, i.e. the callee is expected to be inlined.
				define i32 @caller_bittest(i32 %a, i32* %P) {
				; CHECK-LABEL: @caller_bittest(
				; CHECK-NOT: call i32 @callee_bittest
				%r= call i32 @callee_bittest(i32 %a, i32* %P)
				ret i32 %r
				}

				; Callee with a 12-case switch on %a over the contiguous values 1001..1012,
				; cycling through four destinations (%sw.bb101 .. %sw.bb104). Each of those
				; four blocks does a volatile store of %a to %P; %sw.default branches
				; straight to %return. The function always returns 42.
				; NOTE(review): the name suggests this models a switch that would be
				; lowered as a jump table -- confirm against the inline cost logic.
				define i32 @callee_jumptable(i32 %a, i32* %P) {
				switch i32 %a, label %sw.default [
				i32 1001, label %sw.bb101
				i32 1002, label %sw.bb102
				i32 1003, label %sw.bb103
				i32 1004, label %sw.bb104
				i32 1005, label %sw.bb101
				i32 1006, label %sw.bb102
				i32 1007, label %sw.bb103
				i32 1008, label %sw.bb104
				i32 1009, label %sw.bb101
				i32 1010, label %sw.bb102
				i32 1011, label %sw.bb103
				i32 1012, label %sw.bb104
				]

				sw.default:
				br label %return

				sw.bb101:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb102:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb103:
				store volatile i32 %a, i32* %P
				br label %return

				sw.bb104:
				store volatile i32 %a, i32* %P
				br label %return

				return:
				ret i32 42
				}

				; The CHECK line requires a call to @callee_jumptable to still be present in
				; the output, i.e. the callee is expected NOT to be inlined. Note that the
				; call passes %b as the switch operand; %a is unused here.
				define i32 @caller_jumptable(i32 %a, i32 %b, i32* %P) {
				; CHECK-LABEL: @caller_jumptable(
				; CHECK: call i32 @callee_jumptable
				%r = call i32 @callee_jumptable(i32 %b, i32* %P)
				ret i32 %r
				}