This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/CodeGen/
-
CodeGen/
29/62
MachineBlockPlacement.cpp
-
test/CodeGen/
-
CodeGen/
-
AArch64/
-
branch-relax-cbz.ll
-
combine-comparisons-by-cse.ll
-
optimize-cond-branch.ll
-
AMDGPU/
-
basic-branch.ll
-
branch-relaxation.ll
-
cf-loop-on-constant.ll
-
convergent-inlineasm.ll
-
salu-to-valu.ll
-
ARM/
-
2007-05-22-tailmerge-3.ll
-
atomic-cmpxchg.ll
-
fold-stack-adjust.ll
-
PowerPC/
-
tail-dup-break-cfg.ll
3/9
tail-dup-layout.ll
-
SPARC/
-
sjlj.ll
-
WebAssembly/
-
mem-intrinsics.ll
-
X86/
-
block-placement.ll
-
bypass-slow-division-32.ll
-
sse1.ll
-
tail-dup-merge-loop-headers.ll
-
tail-dup-repeat.ll
-
tail-opts.ll
-
twoaddr-coalesce-3.ll
-
win-alloca-expander.ll

Differential D28522

Codegen: Make chains from trellis-shaped CFGs
ClosedPublic

Authored by iteratee on Jan 10 2017, 11:39 AM.

Download Raw Diff

Details

Reviewers

davidxl
• tstellarAMD
jlebar
javed.absar

Commits

rG7fbec9bdf1b7: Codegen: Make chains from trellis-shaped CFGs
rL295223: Codegen: Make chains from trellis-shaped CFGs

Summary

Lay out trellis-shaped CFGs optimally.
A trellis of the shape below:

A     B
|\   /|
| \ / |
|  X  |
| / \ |
|/   \|
C     D

would be laid out A; B->C ; D by the current layout algorithm. Now we identify
trellises and lay them out either A->C; B->D or A->D; B->C. This scales with an
increasing number of predecessors. A trellis is a group of 2 or more
predecessor blocks that all have the same successors.

Because of this, we can tail-duplicate to extend existing trellises.

As an example consider the following CFG:

  B   D   F   H
 / \ / \ / \ / \
A---C---E---G---Ret

Where A,C,E,G are all small (Currently 2 instructions).

The CFG preserving layout is then A,B,C,D,E,F,G,H,Ret.

The current code will copy C into B, E into D and G into F and yield the layout
A,C,B(C),E,D(E),G,F(G),H,Ret

define void @straight_test(i32 %tag) {
entry:
  br label %test1
test1: ; A
  %tagbit1 = and i32 %tag, 1
  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
  br i1 %tagbit1eq0, label %test2, label %optional1
optional1: ; B
  call void @a()
  br label %test2
test2: ; C
  %tagbit2 = and i32 %tag, 2
  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
  br i1 %tagbit2eq0, label %test3, label %optional2
optional2: ; D
  call void @b()
  br label %test3
test3: ; E
  %tagbit3 = and i32 %tag, 4
  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
  br i1 %tagbit3eq0, label %test4, label %optional3
optional3: ; F
  call void @c()
  br label %test4
test4: ; G
  %tagbit4 = and i32 %tag, 8
  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
  br i1 %tagbit4eq0, label %exit, label %optional4
optional4: ; H
  call void @d()
  br label %exit
exit:
  ret void
}

Here is the layout after D27742:

straight_test:                          # @straight_test
; ... Prologue elided
; BB#0:                                 # %entry ; A (merged with test1)
; ... More prologue elided
        mr 30, 3
        andi. 3, 30, 1
        bc 12, 1, .LBB0_2
; BB#1:                                 # %test2 ; C
        rlwinm. 3, 30, 0, 30, 30
        beq      0, .LBB0_3
        b .LBB0_4
.LBB0_2:                                # %optional1 ; B (copy of C)
        bl a
        nop
        rlwinm. 3, 30, 0, 30, 30
        bne      0, .LBB0_4
.LBB0_3:                                # %test3 ; E
        rlwinm. 3, 30, 0, 29, 29
        beq      0, .LBB0_5
        b .LBB0_6
.LBB0_4:                                # %optional2 ; D (copy of E)
        bl b
        nop
        rlwinm. 3, 30, 0, 29, 29
        bne      0, .LBB0_6
.LBB0_5:                                # %test4 ; G
        rlwinm. 3, 30, 0, 28, 28
        beq      0, .LBB0_8
        b .LBB0_7
.LBB0_6:                                # %optional3 ; F (copy of G)
        bl c
        nop
        rlwinm. 3, 30, 0, 28, 28
        beq      0, .LBB0_8
.LBB0_7:                                # %optional4 ; H
        bl d
        nop
.LBB0_8:                                # %exit ; Ret
        ld 30, 96(1)                    # 8-byte Folded Reload
        addi 1, 1, 112
        ld 0, 16(1)
        mtlr 0
        blr

The tail-duplication has produced some benefit, but it has also produced a
trellis which is not laid out optimally. With this patch, we improve the layouts
of such trellises, and decrease the cost calculation for tail-duplication
accordingly.

This patch produces the layout A,C,E,G,B,D,F,H,Ret. This layout does have
back edges, which is a negative, but it has a bigger compensating
positive, which is that it handles the case where there are long strings
of skipped blocks much better than the original layout. Both layouts
handle runs of executed blocks equally well. Branch prediction also
improves if there is any correlation between subsequent optional blocks.

Here is the resulting concrete layout:

straight_test:                          # @straight_test
; BB#0:                                 # %entry ; A (merged with test1)
        mr 30, 3
        andi. 3, 30, 1
        bc 12, 1, .LBB0_4
; BB#1:                                 # %test2 ; C
        rlwinm. 3, 30, 0, 30, 30
        bne      0, .LBB0_5
.LBB0_2:                                # %test3 ; E
        rlwinm. 3, 30, 0, 29, 29
        bne      0, .LBB0_6
.LBB0_3:                                # %test4 ; G
        rlwinm. 3, 30, 0, 28, 28
        bne      0, .LBB0_7
        b .LBB0_8
.LBB0_4:                                # %optional1 ; B (Copy of C)
        bl a
        nop
        rlwinm. 3, 30, 0, 30, 30
        beq      0, .LBB0_2
.LBB0_5:                                # %optional2 ; D (Copy of E)
        bl b
        nop
        rlwinm. 3, 30, 0, 29, 29
        beq      0, .LBB0_3
.LBB0_6:                                # %optional3 ; F (Copy of G)
        bl c
        nop
        rlwinm. 3, 30, 0, 28, 28
        beq      0, .LBB0_8
.LBB0_7:                                # %optional4 ; H
        bl d
        nop
.LBB0_8:                                # %exit

Diff Detail

Event Timeline

iteratee updated this revision to Diff 83836.Jan 10 2017, 11:39 AM

iteratee retitled this revision from to Codegen: Make chains from lattice-shaped CFGs.

iteratee updated this object.

iteratee added a reviewer: davidxl.

iteratee set the repository for this revision to rL LLVM.

iteratee added subscribers: arsenm, wdng, nhaehnle and 12 others.

Herald added a reviewer: • tstellarAMD. · View Herald TranscriptJan 10 2017, 11:39 AM

iteratee updated this object.Jan 10 2017, 11:40 AM

iteratee edited edge metadata.

iteratee added a parent revision: D27742: CodeGen: Allow small copyable blocks to "break" the CFG..

davidxl added inline comments.Jan 10 2017, 1:27 PM

lib/CodeGen/MachineBlockPlacement.cpp
993	Is it better to relax the condition such that as long as Succ can be dup'd into one of the unplaced predecessors, return true?
999	Change the second 'C' to 'C (+ BB') ' where BB' is the dup of BB
1002	Better change 'B' to 'BB' to match the function argument name
1008	Change E to 'Succ'

Comment update as requested.

Herald edited edge metadata. · View Herald TranscriptJan 10 2017, 2:49 PM

Herald added a subscriber: dschuff. · View Herald Transcript

iteratee added inline comments.Jan 10 2017, 2:50 PM

lib/CodeGen/MachineBlockPlacement.cpp
993	That isn't sufficient, because we need layout to be repeatable. So when we encounter the lattice the second time, there will be no more copies left to make. Chandler also wanted to allow another pass besides layout to do the duplication (perhaps with a larger threshold or different heuristic).

I am not sure I understand your reply about 'repeatability'. Can you elaborate? The suggestion is that if 'Succ' can be dup'ed into 'D' which is unplaced, return true. It basically does the same as skipping 'C'. Besides, checking the successors is not the reliable way to determine if C is block with duplicated bb.

Add comments elaborating on why we use a CFG check rather than checking for partial copiability of blocks or keeping a list of blocks with copies.

Herald edited edge metadata. · View Herald TranscriptJan 10 2017, 3:27 PM

In D28522#641977, @davidxl wrote:

I am not sure I understand your reply about 'repeatability'. Can you elaborate? The suggestion is that if 'Succ' can be dup'ed into 'D' which is unplaced, return true. It basically does the same as skipping 'C'. Besides, checking the successors is not the reliable way to determine if C is block with duplicated bb.

Discussed offline and comments updated to reflect discussion.

iteratee mentioned this in D28583: CodeGen: Allow small copyable blocks to "break" the CFG..Jan 11 2017, 4:34 PM

This patch now adjusts the probability accounting in D28583 to account for lattice layout. This means that more duplications occur. Tests that were in D28583 are now here.

Herald edited edge metadata. · View Herald TranscriptJan 12 2017, 12:03 PM

iteratee edited parent revisions, added: D28583: CodeGen: Allow small copyable blocks to "break" the CFG.; removed: D27742: CodeGen: Allow small copyable blocks to "break" the CFG..Jan 12 2017, 12:04 PM

mzolotukhin removed a subscriber: mzolotukhin.Jan 12 2017, 2:43 PM

Rebased and modified the probability calculations to account for lattices.

Herald added a reviewer: javed.absar. · View Herald TranscriptJan 19 2017, 6:29 PM

Rebasing.

Added lattice check outside of the tail-duplication code.
Updated tests to match.
Added test for non tail-dup lattice.

Is this patch up to date?

Looking over this, It feels like I can split the lattice portion out and put it in first. Even together they aren't very big, so If I can get some initial opinions on this before I start that, it would be appreciated.

I wanted to lay out lattice-type CFG's correctly even if the blocks are larger than what we would tail-duplicate.
To do that I worked out that for a lattice, we can compute which pair of edges forms the optimal fallthrough and use those edges. We don't really have to worry about CFG breaking, because with a lattice we can easily compute the optimal fallthrough pair and take it.

Lattice:
A set of predecessor blocks P that all have the same successors S, where |S| >= 2, |P| >= 2 and S ∩ P = ∅.
We can treat this as a graph optimization problem. There's a well-known general algorithm (the Hungarian algorithm), but I just solved it for size 2 because that's the only size we really care about.

Thanks for taking a look, I'll try and get the plain lattice code separated from the tail duplication code tomorrow.

In D28522#665583, @iteratee wrote:

Looking over this, It feels like I can split the lattice portion out and put it in first. Even together they aren't very big, so If I can get some initial opinions on this before I start that, it would be appreciated.

I thought about this, and looked at what would go into the 2 patches. I don't think it's worth it now.

I updated the comments to make it more clear that we're doing this for general lattices and tail-duplication gets the benefit as well.

davidxl added inline comments.Feb 6 2017, 11:17 AM

lib/CodeGen/MachineBlockPlacement.cpp
771	merge error here -- two returns
816	Have a high level description of the selection algo here as comments.
843	tail duplication can create this pattern - why is it skipped?

    BB    Pred
    | \   /|
    |  \ / |
    |  /\  |
    | /   S2
    |/   /
    S1 <-+
846	merge this with the insert before?
1241	unrelated change?
1335	unrelated change?

Handle Lattices with inter-successor edges.

More comments.

iteratee marked 4 inline comments as done.Feb 6 2017, 4:29 PM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
846	It can't, we have to count those predecessors, we just don't have to check them again.

davidxl added inline comments.Feb 7 2017, 11:02 AM

lib/CodeGen/MachineBlockPlacement.cpp
817	Since lattice with triangle is handled, the definition of lattice here is no longer accurate -- some predecessor does not have the same successors as BB.
846	Can you move insert call down after ++PreCount. If insertion returns false, continue.
847	how does this handle triangle case? In the example, S2 is predecessor of S1, but does not have the same successors of BB.
880	It is better to split the BestEdges into two BestIncomingEdges vectors, one for each viable successor, and then sort them (or BestOutgoingEdges vectors, one for each predecessor). This makes the following code much more readable.
901	don't conflict (aka sharing the same successor)
912	If BestEdges are split, there is no need to do linear search for the best incoming edge for each successor -- after sorting, they are already accessible. Split BestEdges according to Predecessor is fine too. Either way, it is easy to detect conflict.
1296	Extract the special handling of lattice code into a helper to make the main flow of the caller cleaner.

Add back missing statement to handle triangles lost in merge.
Other tidying.

iteratee marked 4 inline comments as done.Feb 7 2017, 1:31 PM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
847	There was a merge mistake where it disappeared, sorry. I put it back.
880	Good catch, that is simpler.
1296	It already is in 2 helpers. Ignoring the logging the logic is: if (isLattice) return getBestLattice(); I don't see how a helper would be useful in this case.

davidxl added inline comments.Feb 7 2017, 1:48 PM

lib/CodeGen/MachineBlockPlacement.cpp
1296	I suggest pushing the isLattice check into the getBestLatticeSuccessor call, and pushing the debug tracking code there too. The caller will then look like: if (LatticeSucc = getBestLatticeSuccessor(...)) { return BlockAndTailDupResult(LatticeSucc, false); }
1314	Is this null? can you explain this code?

Move logging so that control flow is more obvious.

iteratee added inline comments.Feb 7 2017, 2:52 PM

lib/CodeGen/MachineBlockPlacement.cpp
1296	Keeping them separate makes the early return logic simpler, but I moved the debugging code.
1314	Yes. If BB is part of a lattice, but not an optimal edge, then we return early. We've already determined that all of BB's successors have a better fallthrough predecessor.

davidxl added inline comments.Feb 7 2017, 3:52 PM

lib/CodeGen/MachineBlockPlacement.cpp
873	The assert seems redundant - lattice shape check already checks number of predecessors.
877	Since the non-lattice based layout algorithm looks at CFG edges in the forward direction (i.e. it looks at successor edges), it looks wrong to use best incoming edges to detect conflicts. A conflict exists when the best outgoing edges from two predecessors share the same successor.

Example (skip to the end of this example to see the general algorithm):

    BB    Pred
    | \   /|
    |  \ / |
    |  /\  |
    | /   S2
    |/
    S1

1. If the best outgoing edge of BB is BB->S1, while the best outgoing edge of Pred is Pred->S2, then there is no conflict.
   1.1 If there is no triangle (from S2->S1), then the best successor for BB here is S1.
   1.2 If there is an edge from S2 to S1 (forming a triangle), and Freq(S2->S1) > Freq(BB->S1), then we prefer the layout Pred->S2->S1, so BB's best successor should be null.
2. If the best outgoing edge of BB and Pred is the same, say S1, then there is a conflict.
   2.1 No triangle: if Freq(BB->S1) > Freq(Pred->S1), return S1 as the best successor for BB, otherwise S2.
   2.2 There is an edge from S2 to S1: ....

Actually, if you look at all the special cases, there is a more general algorithm to get the optimal solution for all shapes: among all 4 edges (5 edges for the triangular case), find two non-conflicting edges as the fallthrough edges such that their frequency sum is maximal. (Two edges are conflicting if they share either source or dest BB.)

If the lattice contains a triangle, we may want to tail duplicate instead. Check for that.

iteratee updated this revision to Diff 87569.Feb 7 2017, 5:26 PM

iteratee set the repository for this revision to rL LLVM.

iteratee added a reviewer: jlebar.

davidxl added inline comments.Feb 8 2017, 12:17 PM

lib/CodeGen/MachineBlockPlacement.cpp
852	Using tuple does not increase readability (e.g. mapping get<0> ... to actual field). It is better to just use a struct.
870	To greatly increase readability, please put code between line 917 and line 934 into a helper function: getBestFallEdgesInLattice(..) with comment like: // Find two non-conflicting edges with maximal total frequency in the lattice to be used as fall through.

davidxl added inline comments.Feb 8 2017, 12:24 PM

lib/CodeGen/MachineBlockPlacement.cpp
889	Do early return if condition is not true to reduce nesting level.
893	The code will be cleaner if early return is done here too and let the tail merging checking follow.
915	Why do you need to do chain merging here? The method is supposed to do analysis only.

iteratee updated this revision to Diff 87725.Feb 8 2017, 3:35 PM

iteratee marked 4 inline comments as done.Feb 8 2017, 3:40 PM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
893	Tail duplication is the early return case.
915	Well, I'm open to suggestions, but if we don't merge the triangle edge, it doesn't get chosen. We know it's optimal right now. Should we have a src->dest map that saves the other side of lattices and then follow it when we are trying to choose the successor for src? That would be cleaner.

Kyle asked me to look over this from an outsider's perspective and check for understandability. But after starting on it, I think this would be easier after David's comments have been addressed. I just have a few comments for now.

lib/CodeGen/MachineBlockPlacement.cpp
820	One thing that's confusing to me is: What exactly is "in" the lattice? We say that BB is part of the lattice if its successors form a lattice. So it sounds like a recursive definition. But then the criteria for whether BB's successors "form a lattice" is different from the criteria for whether BB is itself in a lattice. The other thing I have no intuition for here is, why do we use the mathematical word "lattice" for this shape? I'm sure there's a good reason, and understanding that might help in general.
993	Can we run clang-format over this patch? git-clang-format, included in the clang sources, can run it just over your changes, so you don't have to reformat the whole file. For instance, this line appears to fit in 80 chars, so doesn't need to be wrapped. Also we usually put "&&" at the end of the line, not the beginning. And there are some lines that appear to be longer than 80 chars. If it helps, I have a script that runs git-clang-format on every arc diff so I don't have to remember to do it myself. https://github.com/jlebar/conf/blob/master/bin/arc
1301	Nit, we usually omit these braces, even though the if-body is multiline.

Patch actually updated to match comments from last time.

davidxl added inline comments.Feb 12 2017, 10:43 PM

lib/CodeGen/MachineBlockPlacement.cpp
915	Ideally this should not happen. If it happens, it means the lattice-based cost analysis is not consistent with the current heuristic for determining the better layout predecessor (which is likely the case). We should probably enhance that logic in the future?
1016	This comment does not seem to be correct. D can not be C's fall through. The lattice analysis has decided that D is the fall-through of BB. You won't need to check all Preds of Succ either -- only Preds that are potential layout predecessor of Succ. In this case, C can not be Succ's layout predecessor.

only git-clang-format

iteratee marked an inline comment as done.Feb 13 2017, 11:07 AM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
820	I didn't have a good word for it. I also can't find one googling. I'm open to suggestions. Lattice matched my initial intuitions, but really it's a subgraph where all maximal linear matchings are the same size.
915	I don't agree. We are conservative on purpose in the normal layout. When we find that we have a lattice, the need to be conservative shrinks. I don't expect them to get the same answer. Even if we did, I think it would be wasteful to re-compute the answer.
1016	I reread it. It's correct. D should be the fallthrough successor of (C+BB). You won't need to check all Preds of Succ either -- only Preds that are potential layout predecessor of Succ. In this case, C can not be Succ's layout predecessor. I'm not sure what you mean by the above. The comment already says "unplaced predecessors"

Improved a comment, removed braces.

davidxl added inline comments.Feb 13 2017, 12:06 PM

lib/CodeGen/MachineBlockPlacement.cpp
915	It is not ideal, but my point still holds -- it is not a good idea to embed chain transformation code inside analysis code. If you want to improve the situation so that the analysis result can be better maintained -- that is fine -- but probably as a different patch.
1016	For the lattice including BB, C+BB, Succ and D, only when {BestA, BestB} == { BB->D, D->Succ} will the tail duplication check be called. Does it mean D can not be C+BB's successor? What I meant is you can pass 'D' as a parameter to this call and only check if Succ can be tail dup'ed into D.

iteratee added inline comments.Feb 13 2017, 12:53 PM

lib/CodeGen/MachineBlockPlacement.cpp
915	Then I'll do it the way I suggested. We already did the analysis for the other side of the lattice, we should save it, and then trust it when we get there.
1016	Yes, but if we tail-duplicate then C+BB has D as a fallthrough. That's why we can ignore it.

Save the analysis for the other side of the trellis so that we don't recompute it later.

Rename lattice to trellis, because it better matches existing usage.

With some changes this looks good for the part I was asked to review.

lib/CodeGen/MachineBlockPlacement.cpp
817	a trellis
818	trellises
836	Nit, auto*, or even just write out the type? "for (auto foo : bar)" is scary because it looks like you might be copying a nontrivially-sized object.
844	Do you want to avoid the double map lookup on SuccPred here? This function looks hot.
880	It looks like you don't actually care about anything other than the first two elements of this stable_sort? Last time I checked, std::stable_sort was relatively slow, and prone to allocate heap memory. If this is hot, you may want to avoid std::stable_sort. std::nth_element will sort of do what you want, except it doesn't seem to be stable. You may do better with a custom "GetTopTwo" function. Or maybe that's a premature optimization. :)
904	trellises? Now I'm not sure if this a typo or not, but I can't find "trelliss" as the plural in any dictionary I have onhand.
920	Nit, I'd move this down to under the comment that says "Collect the edge frequencies". Especially since the "2" in the constructor only makes sense after the if statement below.
925	Maybe "and a trellis of that size is basically unheard of"?
948	Any reason you don't want to return a SmallVector from getBestNonConflictingEdges? Or even an std::pair? Then you could say WeightedEdge BestA, BestB; std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges); which seems closer to what you mean.
967	Personally I'd prefer not to use "auto" here. MachineBasicBlock* isn't too much to type out, and otherwise there's no obvious, nearby anchor for the reader as to what's going on.
1296	Nit, capital letter
1303	This is unrelated to your patch, so you don't need to change it or anything, but if we always check `!BlockFilter || BlockFilter->count(Foo)`, then shouldn't we just pass BlockFilter by reference and initialize it to an empty map when necessary? This would be much more ergonomic.
1310	Capital "i", lower-case "U", period.

This revision is now accepted and ready to land.Feb 13 2017, 3:05 PM

iteratee updated this revision to Diff 88277.Feb 13 2017, 4:49 PM

iteratee marked 7 inline comments as done.

davidxl added inline comments.Feb 14 2017, 3:19 PM

test/CodeGen/PowerPC/tail-dup-layout.ll
204	The term 'unavoidable' is not well defined -- why is 'then2' unavoidable?
254	Is this comment relevant here?
267	change name to trellis_test
275	Use non-equal branch probabilities here to make the result more obvious in different scenarios: when there are conflicts in the best incoming edges, when there are no conflicts, etc.
305	We probably also need a test for trellis+triangle shape (without taildup)
323	change name.

Improved test coverage in response to comments.

iteratee marked an inline comment as done.Feb 15 2017, 10:39 AM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
880	I don't expect the lists to be large in practice, so I'll just use stable_sort for now. It's simple enough to revisit if it becomes a bottleneck.
904	I messed up find and replace. It's fixed now.
test/CodeGen/PowerPC/tail-dup-layout.ll
254	Yes.
275	The test is now larger. It handles conflicting incoming edges, and a couple of non-conflicting edges, and a triangle.
305	f->ret is a triangle edge. I'll make the test bigger with non-balanced edges to cover more scenarios.

lgtm

Closed by commit rL295223: Codegen: Make chains from trellis-shaped CFGs (authored by iteratee). · Explain WhyFeb 15 2017, 12:00 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

CodeGen/

MachineBlockPlacement.cpp

269 lines

test/

CodeGen/

AArch64/

branch-relax-cbz.ll

13 lines

combine-comparisons-by-cse.ll

2 lines

optimize-cond-branch.ll

2 lines

AMDGPU/

basic-branch.ll

5 lines

branch-relaxation.ll

3 lines

cf-loop-on-constant.ll

2 lines

convergent-inlineasm.ll

1 line

salu-to-valu.ll

2 lines

ARM/

2007-05-22-tailmerge-3.ll

8 lines

atomic-cmpxchg.ll

8 lines

fold-stack-adjust.ll

2 lines

PowerPC/

tail-dup-break-cfg.ll

14 lines

tail-dup-layout.ll

269 lines

SPARC/

sjlj.ll

14 lines

WebAssembly/

mem-intrinsics.ll

2 lines

X86/

block-placement.ll

41 lines

bypass-slow-division-32.ll

15 lines

sse1.ll

34 lines

tail-dup-merge-loop-headers.ll

4 lines

tail-dup-repeat.ll

2 lines

tail-opts.ll

7 lines

twoaddr-coalesce-3.ll

4 lines

win-alloca-expander.ll

24 lines

Diff 87333

lib/CodeGen/MachineBlockPlacement.cpp

Show First 20 Lines • Show All 434 Lines • ▼ Show 20 Lines	void rotateLoop(
const BlockFilterSet &LoopBlockSet);		const BlockFilterSet &LoopBlockSet);
void rotateLoopWithProfile(		void rotateLoopWithProfile(
BlockChain &LoopChain, const MachineLoop &L,		BlockChain &LoopChain, const MachineLoop &L,
const BlockFilterSet &LoopBlockSet);		const BlockFilterSet &LoopBlockSet);
void collectMustExecuteBBs();		void collectMustExecuteBBs();
void buildCFGChains();		void buildCFGChains();
void optimizeBranches();		void optimizeBranches();
void alignBlocks();		void alignBlocks();
		/// Returns true if a block should be tail-duplicated to increase fallthrough
		/// opportunities.
bool shouldTailDuplicate(MachineBasicBlock *BB);		bool shouldTailDuplicate(MachineBasicBlock *BB);
/// Check the edge frequencies to see if tail duplication will increase		/// Check the edge frequencies to see if tail duplication will increase
/// fallthroughs.		/// fallthroughs.
bool isProfitableToTailDup(		bool isProfitableToTailDup(
const MachineBasicBlock BB, const MachineBasicBlock Succ,		const MachineBasicBlock BB, const MachineBasicBlock Succ,
BranchProbability AdjustedSumProb,		BranchProbability AdjustedSumProb,
const BlockChain &Chain, const BlockFilterSet *BlockFilter);		const BlockChain &Chain, const BlockFilterSet *BlockFilter);
		/// Check for a lattice layout.
		bool isLattice(
		const MachineBasicBlock *BB,
		const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
		const BlockChain &Chain, const BlockFilterSet *BlockFilter);
		/// Get the best successor given a lattice layout.
		MachineBasicBlock* getBestLatticeSuccessor(
		const MachineBasicBlock *BB,
		const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
		const BlockChain &Chain, const BlockFilterSet *BlockFilter);
/// Returns true if a block can tail duplicate into all unplaced		/// Returns true if a block can tail duplicate into all unplaced
/// predecessors. Filters based on loop.		/// predecessors. Filters based on loop.
bool canTailDuplicateUnplacedPreds(		bool canTailDuplicateUnplacedPreds(
const MachineBasicBlock BB, MachineBasicBlock Succ,		const MachineBasicBlock BB, MachineBasicBlock Succ,
const BlockChain &Chain, const BlockFilterSet *BlockFilter);		const BlockChain &Chain, const BlockFilterSet *BlockFilter);

public:		public:
static char ID; // Pass identification, replacement for typeid		static char ID; // Pass identification, replacement for typeid
▲ Show 20 Lines • Show All 151 Lines • ▼ Show 20 Lines	getAdjustedProbability(BranchProbability OrigProb,
if (SuccProbN >= SuccProbD)		if (SuccProbN >= SuccProbD)
SuccProb = BranchProbability::getOne();		SuccProb = BranchProbability::getOne();
else		else
SuccProb = BranchProbability(SuccProbN, SuccProbD);		SuccProb = BranchProbability(SuccProbN, SuccProbD);

return SuccProb;		return SuccProb;
}		}

/// Check if a block should be tail duplicated.		/// Check if \p BB has exactly the successors in \p Successors.
		static bool hasSameSuccessors(
		MachineBasicBlock &BB,
		SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
		if (BB.succ_size() != Successors.size())
		return false;
		// We don't want to count self-loops
		if (Successors.count(&BB))
		return false;
		for (MachineBasicBlock *Succ : BB.successors())
		if (!Successors.count(Succ))
		return false;
		return true;
		}

		/// Check if a block should be tail duplicated to increase fallthrough
		/// opportunities.
/// \p BB Block to check.		/// \p BB Block to check.
bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {		bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
// Blocks with single successors don't create additional fallthrough		// Blocks with single successors don't create additional fallthrough
// opportunities. Don't duplicate them. TODO: When conditional exits are		// opportunities. Don't duplicate them. TODO: When conditional exits are
// analyzable, allow them to be duplicated.		// analyzable, allow them to be duplicated.
bool IsSimple = TailDup.isSimpleBB(BB);		bool IsSimple = TailDup.isSimpleBB(BB);

if (BB->succ_size() == 1)		if (BB->succ_size() == 1)
▲ Show 20 Lines • Show All 93 Lines • ▼ Show 20 Lines	bool MachineBlockPlacement::isProfitableToTailDup(
// BB BB		// BB BB
// \| \Qout \| \		// \| \Qout \| \
// P\| C \| =		// P\| C \| =
// = C' \| C		// = C' \| C
// \| /Qin \| \|		// \| /Qin \| \|
// \| / \| C' (+Succ)		// \| / \| C' (+Succ)
// Succ Succ /\|		// Succ Succ /\|
// / \ \| \/ \|		// / \ \| \/ \|
// U/ =V = /= =		// U/ =V \| == \|
// / \ \| / \\|		// / \ \| / \\|
// D E D E		// D E D E
// '=' : Branch taken for that CFG edge		// '=' : Branch taken for that CFG edge
// Cost in the first case is: P + V		// Cost in the first case is: P + V
// For this calculation, we always assume P > Qout. If Qout > P		// For this calculation, we always assume P > Qout. If Qout > P
// The result of this function will be ignored at the caller.		// The result of this function will be ignored at the caller.
// Cost in the second case is: Qout + Qin * V + P * U + P * V		// Cost in the second case is: Qout + Qin * U + P * V
// TODO(iteratee): If we lay out D after Succ, the P * U term
// goes away. This logic is coming in D28522.

if (PDom == nullptr \|\| !Succ->isSuccessor(PDom)) {		if (PDom == nullptr \|\| !Succ->isSuccessor(PDom)) {
BranchProbability UProb = BestSuccSucc;		BranchProbability UProb = BestSuccSucc;
BranchProbability VProb = AdjustedSuccSumProb - UProb;		BranchProbability VProb = AdjustedSuccSumProb - UProb;
BlockFrequency V = SuccFreq * VProb;		BlockFrequency V = SuccFreq * VProb;
BlockFrequency QinV = Qin * VProb;		BlockFrequency QinU = Qin * UProb;
BlockFrequency BaseCost = P + V;		BlockFrequency BaseCost = P + V;
BlockFrequency DupCost = Qout + QinV + P * AdjustedSuccSumProb;		BlockFrequency DupCost = Qout + QinU + P * VProb;
return greaterWithBias(BaseCost, DupCost, EntryFreq);		return greaterWithBias(BaseCost, DupCost, EntryFreq);
		davidxlUnsubmitted Done Reply Inline Actions merge error here -- two returns davidxl: merge error here -- two returns
}		}
BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);		BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
BranchProbability VProb = AdjustedSuccSumProb - UProb;		BranchProbability VProb = AdjustedSuccSumProb - UProb;
BlockFrequency U = SuccFreq * UProb;		BlockFrequency U = SuccFreq * UProb;
BlockFrequency V = SuccFreq * VProb;		BlockFrequency V = SuccFreq * VProb;
// If there is a post-dominating successor, here is the calculation:		// If there is a post-dominating successor, here is the calculation:
// BB BB BB BB		// BB BB BB BB
// \| \Qout \| \ \| \Qout \| \		// \| \Qout \| \ \| \Qout \| \
// \|P C \| = \|P C \| =		// \|P C \| = \|P C \| =
// = C' \|P C = C' \|P C		// = C' \|P C = C' \|P C
// \| /Qin \| \| \| /Qin \| \|		// \| /Qin \| \| \| /Qin \| \|
// \| / \| C' (+Succ) \| / \| C' (+Succ)		// \| / \| C' (+Succ) \| / \| C' (+Succ)
// Succ Succ /\| Succ Succ /\|		// Succ Succ /\| Succ Succ /\|
// \| \ V \| \/ \| \| \ V \| \/ \|		// \| \ V \| \/ \| \| \ V \| \/ \|
// \|U \ \|U /\ \| \|U = \|U /\ \|		// \|U \ \|U /\ \| \|U = \|U /\ \|
// = D = = =\| \| D \| = =\|		// = D = = \= \| D \| = =\|
// \| / \|/ D \| / \|/ D		// \| / \|/ D \| / \|/ D
// \| / \| / \| = \| /		// \| / \| / \| = \| /
// \|/ \| / \|/ \| =		// \|/ \| / \|/ \| =
// Dom Dom Dom Dom		// Dom Dom Dom Dom
// '=' : Branch taken for that CFG edge		// '=' : Branch taken for that CFG edge
// The cost for taken branches in the first case is P + U		// The cost for taken branches in the first case is P + U
// The cost in the second case (assuming independence), given the layout:		// The cost in the second case (assuming independence), given the layout:
// BB, Succ, (C+Succ), D, Dom		// BB, Succ, (C+Succ), D, Dom
// is Qout + P * V + Qin * U		// is Qout + P * V + Qin * U
// compare P + U vs Qout + P + Qin * U.		// compare P + U vs Qout + P * U + Qin.
//		//
// The 3rd and 4th cases cover when Dom would be chosen to follow Succ.		// The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
//		//
// For the 3rd case, the cost is P + 2 * V		// For the 3rd case, the cost is P + 2 * V
// For the 4th case, the cost is Qout + Qin * U + P * V + V		// For the 4th case, the cost is Qout + Qin * U + P * V + V
// We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V		// We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V
if (UProb > AdjustedSuccSumProb / 2		if (UProb > AdjustedSuccSumProb / 2
&& !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom],		&& !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom],
UProb, UProb, Chain, BlockFilter)) {		UProb, UProb, Chain, BlockFilter))
// Cases 3 & 4		// Cases 3 & 4
return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),		return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),
EntryFreq);		EntryFreq);
}
// Cases 1 & 2		// Cases 1 & 2
return greaterWithBias(		return greaterWithBias(
(P + U), (Qout + Qin * UProb + P * AdjustedSuccSumProb), EntryFreq);		(P + U), (Qout + Qin * AdjustedSuccSumProb + P * UProb), EntryFreq);
		}

		/// Check for a lattice layout. \p BB forms a lattice if all of the
		/// predecessors of all of its successors have the same successors as BB. We
		davidxlUnsubmitted Done Reply Inline Actions Have a high level description of the selection algo here as comments. davidxl: Have a high level description of the selection algo here as comments.
		/// ignore lattices where BB doesn't have 2 successors because for fewer than 2,
		davidxlUnsubmitted Done Reply Inline Actions Since lattice with triangle is handled, the definition of lattice here is no longer accurate -- some predecessor does not have the same successors as BB. davidxl: Since lattice with triangle is handled, the definition of lattice here is no longer accurate…
		jlebarUnsubmitted Done Reply Inline Actions a trellis jlebar: a trellis
		/// it's trivial, and for 3 or greater they are very uncommon and complex to
		jlebarUnsubmitted Done Reply Inline Actions trellises jlebar: trellises
		/// compute optimally. We allow a triangle where one of BB's successors is a
		/// predecessor of the other.
		jlebarUnsubmitted Not Done Reply Inline Actions One thing that's confusing to me is: What exactly is "in" the lattice? We say that BB is part of the lattice if its successors form a lattice. So it sounds like a recursive definition. But then the criteria for whether BB's successors "form a lattice" is different from the criteria for whether BB is itself in a lattice. The other thing I have no intuition for here is, why do we use the mathematical word "lattice" for this shape? I'm sure there's a good reason, and understanding that might help in general. jlebar: One thing that's confusing to me is: What exactly is "in" the lattice? We say that BB is part…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions I didn't have a good word for it. I also can't find one googling. I'm open to suggestions. Lattice matched my initial intuitions, but really it's a subgraph where all maximal linear matchings are the same size. iteratee: I didn't have a good word for it. I also can't find one googling. I'm open to suggestions.
		bool MachineBlockPlacement::isLattice(
		const MachineBasicBlock *BB,
		const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
		const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
		// Technically BB could form a lattice with branching factor higher than 2.
		// But that's extremely uncommon.
		if (BB->succ_size() != 2 \|\| ViableSuccs.size() != 2)
		return false;

		SmallPtrSet<const MachineBasicBlock *, 2>
		Successors(BB->succ_begin(), BB->succ_end());
		// To avoid reviewing the same predecessors twice.
		SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;

		for (auto Succ : ViableSuccs) {
		int PredCount = 0;
		jlebarUnsubmitted Done Reply Inline Actions Nit, auto*, or even just write out the type? "for (auto foo : bar)" is scary because it looks like you might be copying a nontrivially-sized object. jlebar: Nit, auto*, or even just write out the type? "for (auto foo : bar)" is scary because it looks…
		for (auto SuccPred : Succ->predecessors()) {
		SeenPreds.insert(SuccPred);
		if (SuccPred == BB
		\|\| (BlockFilter && !BlockFilter->count(SuccPred))
		\|\| BlockToChain[SuccPred] == &Chain
		\|\| BlockToChain[SuccPred] == BlockToChain[Succ])
		continue;
		davidxlUnsubmitted Done Reply Inline Actions tail duplication can create this pattern - why is it skipped? ` `BB Pred \| \ /\| \| \ / \| \| /\ \| \| / S2 \| / / S1 <-+` ` davidxl: tail duplication can create this pattern - why is it skipped? ` ``` `BB Pred \| \…
		++PredCount;
		jlebarUnsubmitted Done Reply Inline Actions Do you want to avoid the double map lookup on SuccPred here? This function looks hot. jlebar: Do you want to avoid the double map lookup on SuccPred here? This function looks hot.
		if (SeenPreds.count(SuccPred))
		continue;
		davidxlUnsubmitted Done Reply Inline Actions merge this with the insert before? davidxl: merge this with the insert before?
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions It can't, we have to count those predecessors, we just don't have to check them again. iteratee: It can't, we have to count those predecessors, we just don't have to check them again.
		davidxlUnsubmitted Done Reply Inline Actions Can you move insert call down after ++PredCount. If insertion returns false, continue. davidxl: Can you move insert call down after ++PredCount. If insertion returns false, continue.
		if (!hasSameSuccessors(*SuccPred, Successors))
		davidxlUnsubmitted Done Reply Inline Actions how does this handle triangle case? In the example, S2 is predecessor of S1, but does not have the same successors of BB. davidxl: how does this handle triangle case? In the example, S2 is predecessor of S1, but does not have…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions There was a merge mistake where it disappeared, sorry. I put it back. iteratee: There was a merge mistake where it disappeared, sorry. I put it back.
		return false;
		}
		// If one of the successors has only BB as a predecessor, it is not a
		// lattice.
		if (PredCount < 1)
		davidxlUnsubmitted Done Reply Inline Actions Using tuple does not increase readability (e.g. mapping get<0> ... to actual field). It is better to just use a struct. davidxl: Using tuple does not increase readability (e.g. mapping get<0> ... to actual field). It is…
		return false;
}		}
		return true;
		}

		/// Get the best successor from \p BB based on \p BB being part of a lattice.
		/// We only handle lattices with 2 successors, so the algorithm is
		/// straightforward: Find the best pair of edges that don't conflict. We find
		/// the best incoming edge for each successor in the lattice. If those conflict,
		/// we consider which of them should be replaced with the second best.
		MachineBasicBlock* MachineBlockPlacement::getBestLatticeSuccessor(
		const MachineBasicBlock *BB,
		const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
		const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
		SmallPtrSet<const MachineBasicBlock *, 4>
		Successors(BB->succ_begin(), BB->succ_end());
		SmallVector<std::tuple<BlockFrequency,
		const MachineBasicBlock , MachineBasicBlock >, 8> BestEdges;
		davidxlUnsubmitted Done Reply Inline Actions To greatly increase readability, please put code between line 917 and line 934 into a helper function: getBestFallEdgesInLattice(..) with comment like: // Find two non-conflicting edges with maximal total frequency in the lattice to be used as fall through. davidxl: To greatly increase readability, please put code between line 917 and line 934 into a helper…

		// We assume size 2 because it's common. For general n, we would have to do
		// the Hungarian algorithm, but it's not worth the complexity because more
		davidxlUnsubmitted Done Reply Inline Actions The assert seems redundant - lattice shape check already checks number of predecessors. davidxl: The assert seems redundant - lattice shape check already checks number of predecessors.
		// than 2 successors is fairly uncommon, and a lattice is basically
		// non-existent.
		if (Successors.size() != 2 \|\| ViableSuccs.size() != 2)
		return nullptr;
		davidxlUnsubmitted Not Done Reply Inline Actions Since the non-lattice based layout algorithm looks at cfg edges in forward direction (i.e. look at successor edges), it looks wrong to use best incoming edges to detect conflict. The conflict exists when the best outgoing edges from two predecessor share the same successor. Example: (skip to the end of this example to see general algorithm). BB Pred \| \ /\| \| \ / \| \| /\ \| \| / S2 \| / S1 If best outgoing edges of BB is BB->S1, while the best outgoing edges of Pred is Pred->S2, then there is no conflict. 1.1 If there is no triangle (from S2->S1), then the best successor for BB here is S1. 1.2 If there is an edge from S2 to S1 (forming triangle), if Freq(S2->S1) > Freq(BB->S1), then we prefer layout Pred->S2->S1, so BB's best successor should be null. If the best outgoing edge of BB and Pred is the same say S1, then there is a conflict. 2.1. no triangle: if Freq(BB->S1) > Freq(Pred->S1), return S1 as the best successor for BB, otherwise S2 2.2. there is an edge from S2 to S1: .... Actually if you look at all the special cases, there is a more general algorithm to get the optimal solution for all shapes: among all 4 edges (5 edges for the triangular case), find two non conflicting edges as the fallthrough edges such that their frequency sum is maximal. (two edges are conflicting if they share either source or dest BB). davidxl: Since the non-lattice based layout algorithm looks at cfg edges in forward direction (i.e. look…

		// Collect the edge frequencies of all edges that form the lattice.
		for (auto Succ : ViableSuccs) {
		davidxlUnsubmitted Done Reply Inline Actions It is better to split the BestEdges into two BestIncomingEdges vectors, one for each viable successor, and then sort them (or BestOutgoingEdges vectors, one for each predecessor). This makes the following code much more readable. davidxl: It is better to split the BestEdges into two BestIncomingEdges vectors, one for each viable…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions Good catch, that is simpler. iteratee: Good catch, that is simpler.
		jlebarUnsubmitted Not Done Reply Inline Actions It looks like you don't actually care about anything other than the first two elements of this stable_sort? Last time I checked, std::stable_sort was relatively slow, and prone to allocate heap memory. If this is hot, you may want to avoid std::stable_sort. std::nth_element will sort of do what you want, except it doesn't seem to be stable. You may do better with a custom "GetTopTwo" function. Or maybe that's a premature optimization. :) jlebar: It looks like you don't actually care about anything other than the first two elements of this…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions I don't expect the lists to be large in practice, so I'll just use stable_sort for now. It's simple enough to revisit if it becomes a bottleneck. iteratee: I don't expect the lists to be large in practice, so I'll just use stable_sort for now. It's…
		for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
		// Skip any placed predecessors that are not BB
		if (SuccPred != BB)
		if((BlockFilter && !BlockFilter->count(SuccPred))
		\|\| BlockToChain[SuccPred] == &Chain
		\|\| BlockToChain[SuccPred] == BlockToChain[Succ])
		continue;
		BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred)
		* MBPI->getEdgeProbability(SuccPred, Succ);
		davidxlUnsubmitted Done Reply Inline Actions Do early return if condition is not true to reduce nesting level. davidxl: Do early return if condition is not true to reduce nesting level.
		BestEdges.push_back(std::make_tuple(EdgeFreq, SuccPred, Succ));
		}
		}

		davidxlUnsubmitted Not Done Reply Inline Actions The code will be cleaner if early return is done here too and let the tail merging checking follow. davidxl: The code will be cleaner if early return is done here too and let the tail merging checking…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions Tail duplication is the early return case. iteratee: Tail duplication is the early return case.
		auto Cmp = [](
		std::tuple<BlockFrequency, const MachineBasicBlock, MachineBasicBlock> A,
		std::tuple<BlockFrequency, const MachineBasicBlock, MachineBasicBlock> B) {
		return std::get<0>(A) > std::get<0>(B);
		};

		// Sort the edges, and then for each successor, find the best and second best
		// incoming predecessor. If the best incoming predecessors don't conflict,
		davidxlUnsubmitted Done Reply Inline Actions don't conflict (aka sharing the same successor) davidxl: don't conflict (aka sharing the same successor)
		// then that is clearly the best layout. If there is a conflict, one of the
		// successors will have to fallthrough from the second best predecessor. We
		// compare which combination is better overall. Given that we have split them
		jlebarUnsubmitted Not Done Reply Inline Actions trellises? Now I'm not sure if this a typo or not, but I can't find "trelliss" as the plural in any dictionary I have onhand. jlebar: trellises? Now I'm not sure if this a typo or not, but I can't find "trelliss" as the plural…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions I messed up find and replace. It's fixed now. iteratee: I messed up find and replace. It's fixed now.
		// out by successor, we know that the second best will not conflict if the
		// best edges conflict.
		std::stable_sort(BestEdges.begin(), BestEdges.end(), Cmp);
		auto BestA = BestEdges.end();
		auto BestB = BestEdges.end();
		auto SecondBestA = BestEdges.end();
		auto SecondBestB = BestEdges.end();
		for (auto It = BestEdges.begin(); It != BestEdges.end(); ++It) {
		davidxlUnsubmitted Done Reply Inline Actions If BestEdges are split, there is no need to do linear search for the best incoming edge for each successor -- after sorting, they are already accessible. Split BestEdges according to Predecessor is fine too. Either way, it is easy to detect conflict. davidxl: If BestEdges are split, there is no need to do linear search for the best incoming edge for…
		auto &Edge = *It;
		const MachineBasicBlock Pred, Succ;
		std::tie(std::ignore, Pred, Succ) = Edge;
		davidxlUnsubmitted Not Done Reply Inline Actions Why do you need to do chain merging here? The method is supposed to do analysis only. davidxl: Why do you need to do chain merging here? The method is supposed to do analysis only.
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions Well, I'm open to suggestions, but if we don't merge the triangle edge, it doesn't get chosen. We know it's optimal right now. Should we have a src->dest map that saves the other side of lattices and then follow it when we are trying to choose the successor for src? That would be cleaner. iteratee: Well, I'm open to suggestions, but if we don't merge the triangle edge, it doesn't get chosen.
		davidxlUnsubmitted Not Done Reply Inline Actions ideally this should not happen. If it happens, it means the lattice based cost analysis is not consistent with current heuristic of determine better layout predecessor (which is likely the case). We should probably enhance that logic in the future? davidxl: ideally this should not happen. If it happens, it means the lattice based cost analysis is not…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions I don't agree. We are conservative on purpose in the normal layout. When we find that we have a lattice, the need to be conservative shrinks. I don't expect them to get the same answer. Even if we did, I think it would be wasteful to re-compute the answer. iteratee: I don't agree. We are conservative on purpose in the normal layout. When we find that we have a…
		davidxlUnsubmitted Not Done Reply Inline Actions It is not ideal, but my point still holds -- it is not a good idea to embed chain transformation code inside analysis code. If you want to improve the situation so that the analysis result can be better maintained -- that is fine -- but probably as a different patch. davidxl: It is not ideal, but my point still holds -- it is not a good idea to embed chain…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions Then I'll do it the way I suggested. We already did the analysis for the other side of the lattice, we should save it, and then trust it when we get there. iteratee: Then I'll do it the way I suggested. We already did the analysis for the other side of the…
		if (Succ == ViableSuccs[0]) {
		if (BestA == BestEdges.end()) {
		BestA = It;
		continue;
		} else if (SecondBestA == BestEdges.end()) {
		jlebarUnsubmitted Done Reply Inline Actions Nit, I'd move this down to under the comment that says "Collect the edge frequencies". Especially since the "2" in the constructor only makes sense after the if statement below. jlebar: Nit, I'd move this down to under the comment that says "Collect the edge frequencies".
		SecondBestA = It;
		if (SecondBestB != BestEdges.end())
		break;
		continue;
		}
		jlebarUnsubmitted Done Reply Inline Actions Maybe "and a trellis of that size is basically unheard of"? jlebar: Maybe "and a trellis of that size is basically unheard of"?
		} else {
		if (BestB == BestEdges.end()) {
		BestB = It;
		continue;
		} else if (SecondBestB == BestEdges.end()) {
		SecondBestB = It;
		if (SecondBestA != BestEdges.end())
		break;
		continue;
		}
		}
		}
		assert (BestA != BestEdges.end() && BestB != BestEdges.end()
		&& SecondBestA != BestEdges.end() && SecondBestB != BestEdges.end()
		&& "Should have found predecessors for both successors.");
		// Arrange for the correct answer to be in BestA and BestB
		// If the 2 best edges don't conflict, the answer is already there.
		if (std::get<1>(BestA) == std::get<1>(BestB)) {
		// Compare the total fallthrough frequency for each second best and pick
		// the higher.
		BlockFrequency BestAScore = std::get<0>(BestA) + std::get<0>(SecondBestB);
		BlockFrequency BestBScore = std::get<0>(BestB) + std::get<0>(SecondBestA);
		if (BestAScore < BestBScore)
		jlebarUnsubmitted Not Done Reply Inline Actions Any reason you don't want to return a SmallVector from getBestNonConflictingEdges? Or even an std::pair? Then you could say WeightedEdge BestA, BestB; std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges); which seems closer to what you mean. jlebar: Any reason you don't want to return a SmallVector from getBestNonConflictingEdges? Or even an…
		BestA = SecondBestA;
		else
		BestB = SecondBestB;
		}
		std::tuple<BlockFrequency, const MachineBasicBlock, MachineBasicBlock>
		BBEdge = nullptr, OtherEdge = nullptr;
		if (std::get<1>(*BestA) == BB) {
		BBEdge = &*BestA;
		OtherEdge = &*BestB;
		} else if (std::get<1>(*BestB) == BB) {
		BBEdge = &*BestB;
		OtherEdge = &*BestA;
		} else
		return nullptr;
		// If the lattice layout picked a triangle edge, merge that now, while we know
		// it is optimal. Otherwise it may get ignored because of CFG breaking later.
		if (std::get<1>(OtherEdge) == std::get<2>(BBEdge)) {
		auto Succ1 = std::get<2>(*BBEdge);
		auto Succ2 = std::get<2>(*OtherEdge);
		jlebarUnsubmitted Done Reply Inline Actions Personally I'd prefer not to use "auto" here. MachineBasicBlock* isn't too much to type out, and otherwise there's no obvious, nearby anchor for the reader as to what's going on. jlebar: Personally I'd prefer not to use "auto" here. MachineBasicBlock* isn't too much to type out…
		BlockToChain[Succ1]->merge(Succ2, BlockToChain[Succ2]);
		}
		return std::get<2>(*BBEdge);
		}

/// When the option TailDupPlacement is on, this method checks if the		/// When the option TailDupPlacement is on, this method checks if the
/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated		/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
/// into all of its unplaced, unfiltered predecessors, that are not BB.		/// into all of its unplaced, unfiltered predecessors, that are not BB.
bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(		bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
const MachineBasicBlock BB, MachineBasicBlock Succ,		const MachineBasicBlock BB, MachineBasicBlock Succ,
const BlockChain &Chain, const BlockFilterSet *BlockFilter) {		const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
if (!shouldTailDuplicate(Succ))		if (!shouldTailDuplicate(Succ))
return false;		return false;

		// For CFG checking.
		SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(), BB->succ_end());
for (MachineBasicBlock *Pred : Succ->predecessors()) {		for (MachineBasicBlock *Pred : Succ->predecessors()) {
// Make sure all unplaced and unfiltered predecessors can be		// Make sure all unplaced and unfiltered predecessors can be
// tail-duplicated into.		// tail-duplicated into.
// Skip any blocks that are already placed or not in this loop.		// Skip any blocks that are already placed or not in this loop.
if (Pred == BB \|\| (BlockFilter && !BlockFilter->count(Pred))		if (Pred == BB \|\| (BlockFilter && !BlockFilter->count(Pred))
\|\| BlockToChain[Pred] == &Chain)		\|\| BlockToChain[Pred] == &Chain)
continue;		continue;
if (!TailDup.canTailDuplicate(Succ, Pred))		if (!TailDup.canTailDuplicate(Succ, Pred)) {
		if (Successors.size() > 1
		&& hasSameSuccessors(*Pred, Successors))
		davidxlUnsubmitted Done Reply Inline Actions Is it better to relax the condition such that as long as Succ can be dup'd into one of the unplaced predecessors, return true? davidxl: Is it better to relax the condition such that as long as Succ can be dup'd into one of the…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions That isn't sufficient, because we need layout to be repeatable. So when we encounter the lattice the second time, there will be no more copies left to make. Chandler also wanted to allow another pass besides layout to do the duplication (perhaps with a larger threshold or different heuristic). iteratee: That isn't sufficient, because we need layout to be repeatable. So when we encounter the…
		jlebarUnsubmitted Done Reply Inline Actions Can we run clang-format over this patch? git-clang-format, included in the clang sources, can run it just over your changes, so you don't have to reformat the whole file. For instance, this line appears to fit in 80 chars, so doesn't need to be wrapped. Also we usually put "&&" at the end of the line, not the beginning. And there are some lines that appear to be longer than 80 chars. If it helps, I have a script that runs git-clang-format on every arc diff so I don't have to remember to do it myself. https://github.com/jlebar/conf/blob/master/bin/arc jlebar: Can we run clang-format over this patch? git-clang-format, included in the clang sources, can…
		// This will result in a lattice after tail duplication, so we don't
		// need to copy Succ into this predecessor. In the presence
		// of a lattice tail duplication can continue to be profitable.
		// For example:
		// A A
		// \|\ \|\
		davidxlUnsubmitted Done Reply Inline Actions Change the second 'C' to 'C (+ BB') ' where BB' is the dup of BB davidxl: Change the second 'C' to 'C (+ BB') ' where BB' is the dup of BB
		// \| \ \| \
		// \| C \| C+BB
		// \| / \| \|
		davidxlUnsubmitted Done Reply Inline Actions Better change 'B' to 'BB' to match the function argument name davidxl: Better change 'B' to 'BB' to match the function argument name
		// \|/ \| \|
		// BB => BB \|
		// \|\ \|\/\|
		// \| \ \|/\\|
		// \| D \| D
		// \| / \| /
		davidxlUnsubmitted Done Reply Inline Actions Change E to 'Succ' davidxl: Change E to 'Succ'
		// \|/ \|/
		// Succ Succ
		//
		// After BB was duplicated into C, the layout looks like the one on the
		// right. BB and C now have the same successors. When considering whether
		// Succ can be duplicated into all its unplaced predecessors, we ignore C.
		// We can do this because C already has a profitable fallthrough, namely
		// D. TODO(iteratee): ignore sufficiently cold predecessors for
		davidxlUnsubmitted Not Done Reply Inline Actions This comment does not seem to be correct. D can not be C's fall through. The lattice analysis has decided that D is the fall-through of BB. You won't need to check all Preds of Succ either -- only Preds that are potential layout predecessor of Succ. In this case, C can not be Succ's layout predecessor. davidxl: This comment does not seem to be correct. D can not be C's fall through. The lattice analysis…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions I reread it. It's correct. D should be the fallthrough successor of (C+BB). You won't need to check all Preds of Succ either -- only Preds that are potential layout predecessor of Succ. In this case, C can not be Succ's layout predecessor. I'm not sure what you mean by the above. The comment already says "unplaced predecessors" iteratee: I reread it. It's correct. D should be the fallthrough successor of (C+BB). > You won't need…
		davidxlUnsubmitted Not Done Reply Inline Actions For the lattice including BB, C+BB, Succ and D, only when {BestA, BestB} == { BB->D, D->Succ}, will the tail duplication check be called. Does it mean D can not be C+BB's successor? What I meant is you can pass 'D' as a parameter to this call and only check if Succ can be tail dup into D. davidxl: For the lattice including BB, C+BB, Succ and D, only when {BestA, BestB} == { BB->D, D->Succ}…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions Yes, but if we tail-duplicate then C+BB has D as a fallthrough. That's why we can ignore it. iteratee: Yes, but if we tail-duplicate then C+BB has D as a fallthrough. That's why we can ignore it.
		// duplication and for this test.
		//
		// This allows lattices to be laid out in 2 separate chains
		// (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
		// because it allows the creation of 2 fallthrough paths with links
		// between them, and we correctly identify the best layout for these
		// CFGs. We want to extend lattices that the user created in addition to
		// lattices created by tail-duplication, so we just look for the CFG.
		continue;
return false;		return false;
}		}
		}
return true;		return true;
}		}

/// When the option OutlineOptionalBranches is on, this method		/// When the option OutlineOptionalBranches is on, this method
/// checks if the fallthrough candidate block \p Succ (of block		/// checks if the fallthrough candidate block \p Succ (of block
/// \p BB) also has other unscheduled predecessor blocks which		/// \p BB) also has other unscheduled predecessor blocks which
/// are also successors of \p BB (forming triangular shape CFG).		/// are also successors of \p BB (forming triangular shape CFG).
/// If none of such predecessors are small, it returns true.		/// If none of such predecessors are small, it returns true.
▲ Show 20 Lines • Show All 167 Lines • ▼ Show 20 Lines	bool MachineBlockPlacement::hasBetterLayoutPredecessor(
//		//
// S-------\| ---S		// S-------\| ---S
// \| \| \| \|		// \| \| \| \|
// ---BB \| \| BB		// ---BB \| \| BB
// \| \| \| \|		// \| \| \| \|
// \| Pred----\| \| S1----		// \| Pred----\| \| S1----
// \| \| \| \|		// \| \| \| \|
// --(S1 or S2) ---Pred--		// --(S1 or S2) ---Pred--
		// \|
		// S2
//		//
// topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)		// topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
// + min(freq(Pred->S1), freq(Pred->S2))		// + min(freq(Pred->S1), freq(Pred->S2))
// Non-topo-order cost:		// Non-topo-order cost:
// In the worst case, S2 will not get laid out after Pred.
// non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).		// non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))		// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
// is 0. Then the non topo layout is better when		// is 0. Then the non topo layout is better when
// freq(S->Pred) < freq(BB->S1).		// freq(S->Pred) < freq(BB->S1).
// This is exactly what is checked below.		// This is exactly what is checked below.
// Note there are other shapes that apply (Pred may not be a single block,		// Note there are other shapes that apply (Pred may not be a single block,
// but they all fit this general pattern.)		// but they all fit this general pattern.)
BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);		BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);

// Make sure that a hot successor doesn't have a globally more		// Make sure that a hot successor doesn't have a globally more
// important predecessor.		// important predecessor.
BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;		BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
bool BadCFGConflict = false;		bool BadCFGConflict = false;

for (MachineBasicBlock *Pred : Succ->predecessors()) {		for (MachineBasicBlock *Pred : Succ->predecessors()) {
if (Pred == Succ \|\| BlockToChain[Pred] == &SuccChain \|\|		if (Pred == Succ \|\| BlockToChain[Pred] == &SuccChain \|\|
(BlockFilter && !BlockFilter->count(Pred)) \|\|		(BlockFilter && !BlockFilter->count(Pred)) \|\|
BlockToChain[Pred] == &Chain \|\|		BlockToChain[Pred] == &Chain \|\|
// This check is redundant except for look ahead. This function is		// This check is redundant except for look ahead. This function is
// called for lookahead by isProfitableToTailDup when BB hasn't been		// called for lookahead by isProfitableToTailDup when BB hasn't been
// placed yet.		// placed yet.
(Pred == BB))		(Pred == BB))
continue;		continue;
// Do backward checking.		// Do backward checking.
		davidxlUnsubmitted Done Reply Inline Actions unrelated change? davidxl: unrelated change?
// For all cases above, we need a backward checking to filter out edges that		// For all cases above, we need a backward checking to filter out edges that
// are not 'strongly' biased.		// are not 'strongly' biased.
// BB Pred		// BB Pred
// \ /		// \ /
// Succ		// Succ
// We select edge BB->Succ if		// We select edge BB->Succ if
// freq(BB->Succ) > freq(Succ) * HotProb		// freq(BB->Succ) > freq(Succ) * HotProb
// i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *		// i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
Show All 38 Lines	MachineBlockPlacement::selectBestSuccessor(
auto BestProb = BranchProbability::getZero();		auto BestProb = BranchProbability::getZero();

SmallVector<MachineBasicBlock *, 4> Successors;		SmallVector<MachineBasicBlock *, 4> Successors;
auto AdjustedSumProb =		auto AdjustedSumProb =
collectViableSuccessors(BB, Chain, BlockFilter, Successors);		collectViableSuccessors(BB, Chain, BlockFilter, Successors);

DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");		DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");

		// if BB is part of a lattice, Use the lattice to determine the optimal
		davidxlUnsubmitted Not Done Reply Inline Actions Extract the special handling of lattice code into a helper to make the main flow of the caller cleaner. davidxl: Extract the special handling of lattice code into a helper to make the main flow of the caller…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions It already is in 2 helpers. Ignoring the logging, the logic is: if (isLattice) return getBestLattice(); I don't see how a helper would be useful in this case. iteratee: It already is in 2 helpers. Ignoring the logging, the logic is: if (isLattice) return…
		davidxlUnsubmitted Not Done Reply Inline Actions I suggest pushing the isLattice check into the getBestLatticeSuccessor call. Also pushing the debug tracking code there too. The caller will be like: if (LatticeSucc = getBestLatticeSuccessor(...)) { return BlockAndTailDupResult(LatticeSucc, false); } davidxl: I suggest pushing the isLattice check into the getBestLatticeSuccessor call. Also pushing the debug…
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions Keeping them separate makes the early return logic simpler, but I moved the debugging code. iteratee: Keeping them separate makes the early return logic simpler, but I moved the debugging code.
		jlebarUnsubmitted Not Done Reply Inline Actions Nit, capital letter jlebar: Nit, capital letter
		// fallthrough edges
		if (isLattice(BB, Successors, Chain, BlockFilter)) {
		MachineBasicBlock *LatticeSucc =
		getBestLatticeSuccessor(BB, Successors, Chain, BlockFilter);
		if (LatticeSucc != nullptr) {
		jlebarUnsubmitted Done Reply Inline Actions Nit, we usually omit these braces, even though the if-body is multiline. jlebar: Nit, we usually omit these braces, even though the if-body is multiline.
		auto RealSuccProb = MBPI->getEdgeProbability(BB, LatticeSucc);
		BranchProbability SuccProb =
		jlebarUnsubmitted Not Done Reply Inline Actions This is unrelated to your patch, so you don't need to change it or anything, but if we always check `!BlockFilter \|\| BlockFilter->count(Foo)`, then shouldn't we just pass BlockFilter by reference and initialize it to an empty map when necessary? This would be much more ergonomic. jlebar: This is unrelated to your patch, so you don't need to change it or anything, but if we always…
		getAdjustedProbability(RealSuccProb, AdjustedSumProb);
		DEBUG(dbgs() << " Candidate: " << getBlockName(LatticeSucc)
		<< ", probability: " << SuccProb << " (Lattice)\n");
		BestSucc.BB = LatticeSucc;
		return BestSucc;
		}
		// If we have a lattice, and BB doesn't have the best fallthrough edges,
		jlebarUnsubmitted Not Done Reply Inline Actions Capital "i", lower-case "U", period. jlebar: Capital "i", lower-case "U", period.
		// we should stop now. We've already looked and there's a better fallthrough
		// edge for all the successors.
		DEBUG(dbgs() << "Lattice, but not one of the chosen edges.\n");
		return BestSucc;
		davidxlUnsubmitted Not Done Reply Inline Actions Is this null? can you explain this code? davidxl: Is this null? can you explain this code?
		iterateeAuthorUnsubmitted Not Done Reply Inline Actions Yes. If BB is part of a lattice, but not an optimal edge, then we return early. We've already determined that all of BB's successors have a better fallthrough predecessor. iteratee: Yes. If BB is part of a lattice, but not an optimal edge, then we return early. We've already…
		}

// For blocks with CFG violations, we may be able to lay them out anyway with		// For blocks with CFG violations, we may be able to lay them out anyway with
// tail-duplication. We keep this vector so we can perform the probability		// tail-duplication. We keep this vector so we can perform the probability
// calculations the minimum number of times.		// calculations the minimum number of times.
SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>		SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>
DupCandidates;		DupCandidates;
for (MachineBasicBlock *Succ : Successors) {		for (MachineBasicBlock *Succ : Successors) {
auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);		auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
BranchProbability SuccProb =		BranchProbability SuccProb =
getAdjustedProbability(RealSuccProb, AdjustedSumProb);		getAdjustedProbability(RealSuccProb, AdjustedSumProb);

// This heuristic is off by default.		// This heuristic is off by default.
if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,		if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
HotProb)) {		HotProb)) {
BestSucc.BB = Succ;		BestSucc.BB = Succ;
return BestSucc;		return BestSucc;
}		}

BlockChain &SuccChain = *BlockToChain[Succ];		BlockChain &SuccChain = *BlockToChain[Succ];
// Skip the edge \c BB->Succ if block \c Succ has a better layout		// Skip the edge \c BB->Succ if block \c Succ has a better layout
		davidxlUnsubmitted Done Reply Inline Actions unrelated change? davidxl: unrelated change?
// predecessor that yields lower global cost.		// predecessor that yields lower global cost.
if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,		if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
Chain, BlockFilter)) {		Chain, BlockFilter)) {
// If tail duplication would make Succ profitable, place it.		// If tail duplication would make Succ profitable, place it.
if (TailDupPlacement && shouldTailDuplicate(Succ))		if (TailDupPlacement && shouldTailDuplicate(Succ))
DupCandidates.push_back(std::make_tuple(SuccProb, Succ));		DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
continue;		continue;
}		}
▲ Show 20 Lines • Show All 1,358 Lines • Show Last 20 Lines

test/CodeGen/AArch64/branch-relax-cbz.ll

	; RUN: llc -mtriple=aarch64-apple-darwin -aarch64-cbz-offset-bits=3 < %s \| FileCheck %s			; RUN: llc -mtriple=aarch64-apple-darwin -aarch64-cbz-offset-bits=3 < %s \| FileCheck %s

	; CHECK-LABEL: _split_block_no_fallthrough:			; CHECK-LABEL: _split_block_no_fallthrough:
	; CHECK: cmn x{{[0-9]+}}, #5			; CHECK: cmn x{{[0-9]+}}, #5
	; CHECK-NEXT: b.le [[B2:LBB[0-9]+_[0-9]+]]			; CHECK-NEXT: b.le [[B2:LBB[0-9]+_[0-9]+]]

	; CHECK-NEXT: ; BB#1: ; %b3			; CHECK-NEXT: ; BB#1: ; %b3
	; CHECK: ldr [[LOAD:w[0-9]+]]			; CHECK: ldr [[LOAD:w[0-9]+]]
	; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]			; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
	; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]

	; CHECK-NEXT: [[SKIP_LONG_B]]:
	; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]			; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]

				; CHECK-NEXT: [[B8]]: ; %b8
				; CHECK-NEXT: ret

	; CHECK-NEXT: [[B2]]: ; %b2			; CHECK-NEXT: [[B2]]: ; %b2
	; CHECK: mov w{{[0-9]+}}, #93			; CHECK: mov w{{[0-9]+}}, #93
	; CHECK: bl _extfunc			; CHECK: bl _extfunc
	; CHECK: cbz w{{[0-9]+}}, [[B7]]			; CHECK: cbz w{{[0-9]+}}, [[B7]]
				; CHECK-NEXT: b [[B8]]
	; CHECK-NEXT: [[B8]]: ; %b8
	; CHECK-NEXT: ret

	; CHECK-NEXT: [[B7]]: ; %b7			; CHECK-NEXT: [[B7]]: ; %b7
	; CHECK: mov w{{[0-9]+}}, #13			; CHECK: mov w{{[0-9]+}}, #13
	; CHECK: b _extfunc			; CHECK: b _extfunc

	define void @split_block_no_fallthrough(i64 %val) #0 {			define void @split_block_no_fallthrough(i64 %val) #0 {
	bb:			bb:
	%c0 = icmp sgt i64 %val, -5			%c0 = icmp sgt i64 %val, -5
	br i1 %c0, label %b3, label %b2			br i1 %c0, label %b3, label %b2

	b2:			b2:
	%v0 = tail call i32 @extfunc(i32 93)			%v0 = tail call i32 @extfunc(i32 93)
	%c1 = icmp eq i32 %v0, 0			%c1 = icmp eq i32 %v0, 0
	Show All 18 Lines

test/CodeGen/AArch64/combine-comparisons-by-cse.ll

	Show First 20 Lines • Show All 258 Lines • ▼ Show 20 Lines
	}			}

	; undefined external to prevent possible optimizations			; undefined external to prevent possible optimizations
	declare void @do_something() #1			declare void @do_something() #1

	define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {			define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
	; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ			; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
	; CHECK: cmn			; CHECK: cmn
	; CHECK: b.gt			; CHECK: b.le
	; CHECK: cmp			; CHECK: cmp
	; CHECK: b.gt			; CHECK: b.gt
	entry:			entry:
	%0 = load i32, i32* @a, align 4			%0 = load i32, i32* @a, align 4
	%cmp4 = icmp slt i32 %0, -1			%cmp4 = icmp slt i32 %0, -1
	br i1 %cmp4, label %while.body.preheader, label %while.end			br i1 %cmp4, label %while.body.preheader, label %while.end

	while.body.preheader: ; preds = %entry			while.body.preheader: ; preds = %entry
	▲ Show 20 Lines • Show All 199 Lines • Show Last 20 Lines

test/CodeGen/AArch64/optimize-cond-branch.ll

	; RUN: llc -verify-machineinstrs -o - %s \| FileCheck %s			; RUN: llc -verify-machineinstrs -o - %s \| FileCheck %s
	target triple = "arm64--"			target triple = "arm64--"

	; AArch64InstrInfo::optimizeCondBranch() optimizes the			; AArch64InstrInfo::optimizeCondBranch() optimizes the
	; "x = and y, 256; cmp x, 0; br" from an "and; cbnz" to a tbnz instruction.			; "x = and y, 256; cmp x, 0; br" from an "and; cbnz" to a tbnz instruction.
	; It forgot to clear the a flag resulting in a MachineVerifier complaint.			; It forgot to clear the a flag resulting in a MachineVerifier complaint.
	;			;
	; Writing a stable/simple test is tricky since most tbz instructions are already			; Writing a stable/simple test is tricky since most tbz instructions are already
	; formed in SelectionDAG, optimizeCondBranch() only triggers if the and			; formed in SelectionDAG, optimizeCondBranch() only triggers if the and
	; instruction is in a different block than the conditional jump.			; instruction is in a different block than the conditional jump.
	;			;
	; CHECK-LABEL: func			; CHECK-LABEL: func
	; CHECK-NOT: and			; CHECK-NOT: and
	; CHECK: tbnz			; CHECK: tbz
	define void @func() {			define void @func() {
	%c0 = icmp sgt i64 0, 0			%c0 = icmp sgt i64 0, 0
	br i1 %c0, label %b1, label %b6			br i1 %c0, label %b1, label %b6

	b1:			b1:
	br i1 undef, label %b3, label %b2			br i1 undef, label %b3, label %b2

	b2:			b2:
	Show All 26 Lines

test/CodeGen/AMDGPU/basic-branch.ll

	; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s			; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
	; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s			; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s

	; GCN-LABEL: {{^}}test_branch:			; GCN-LABEL: {{^}}test_branch:
	; GCNNOOPT: v_writelane_b32			; GCNNOOPT: v_writelane_b32
	; GCNNOOPT: v_writelane_b32			; GCNNOOPT: v_writelane_b32
	; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]			; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]


	; GCN: ; BB#1
	; GCNNOOPT: v_readlane_b32			; GCNNOOPT: v_readlane_b32
	; GCNNOOPT: v_readlane_b32			; GCNNOOPT: v_readlane_b32
	; GCN: buffer_store_dword			; GCN: buffer_store_dword
	; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)			; GCNNOOPT: s_endpgm
	; TODO: This waitcnt can be eliminated

	; GCN: {{^}}[[END]]:			; GCN: {{^}}[[END]]:
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {			define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
	%cmp = icmp ne i32 %val, 0			%cmp = icmp ne i32 %val, 0
	br i1 %cmp, label %store, label %end			br i1 %cmp, label %store, label %end

	store:			store:
	Show All 32 Lines

test/CodeGen/AMDGPU/branch-relaxation.ll

	Show First 20 Lines • Show All 485 Lines • ▼ Show 20 Lines

	ret:			ret:
	store volatile i32 7, i32 addrspace(1)* undef			store volatile i32 7, i32 addrspace(1)* undef
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}long_branch_hang:			; GCN-LABEL: {{^}}long_branch_hang:
	; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6			; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
	; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]			; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
				; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
	; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:			; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:

	; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(			; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
	; GCN: s_setpc_b64			; GCN: s_setpc_b64

	; GCN-NEXT: [[LONG_BR_0]]:			; GCN-NEXT: [[LONG_BR_0]]:
	; GCN-DAG: v_cmp_lt_i32			; GCN-DAG: v_cmp_lt_i32
	; GCN-DAG: v_cmp_gt_i32			; GCN-DAG: v_cmp_gt_i32
	▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/cf-loop-on-constant.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s			; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s

	; GCN-LABEL: {{^}}test_loop:			; GCN-LABEL: {{^}}test_loop:
	; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:			; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}
	; GCN: ds_read_b32			; GCN: ds_read_b32
	; GCN: ds_write_b32			; GCN: ds_write_b32
	; GCN: s_branch [[LABEL]]			; GCN: s_branch [[LABEL]]
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {			define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
	entry:			entry:
	%cmp = icmp eq i32 %n, -1			%cmp = icmp eq i32 %n, -1
	br i1 %cmp, label %for.exit, label %for.body			br i1 %cmp, label %for.exit, label %for.body
	▲ Show 20 Lines • Show All 110 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/convergent-inlineasm.ll

	Show All 23 Lines

	; GCN-LABEL: {{^}}nonconvergent_inlineasm:			; GCN-LABEL: {{^}}nonconvergent_inlineasm:
	; GCN: ; mask branch			; GCN: ; mask branch

	; GCN: BB{{[0-9]+_[0-9]+}}:			; GCN: BB{{[0-9]+_[0-9]+}}:
	; GCN: v_cmp_ne_u32_e64			; GCN: v_cmp_ne_u32_e64

	; GCN: BB{{[0-9]+_[0-9]+}}:			; GCN: BB{{[0-9]+_[0-9]+}}:

	define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {			define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
	bb:			bb:
	%tmp = call i32 @llvm.amdgcn.workitem.id.x()			%tmp = call i32 @llvm.amdgcn.workitem.id.x()
	%tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)			%tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)
	%tmp2 = icmp eq i32 %tmp, 8			%tmp2 = icmp eq i32 %tmp, 8
	br i1 %tmp2, label %bb3, label %bb5			br i1 %tmp2, label %bb3, label %bb5

	bb3: ; preds = %bb			bb3: ; preds = %bb
	Show All 10 Lines

test/CodeGen/AMDGPU/salu-to-valu.ll

	Show First 20 Lines • Show All 433 Lines • ▼ Show 20 Lines
	; {{^}}sopc_vopc_legalize_bug:			; {{^}}sopc_vopc_legalize_bug:
	; GCN: s_load_dword [[SGPR:s[0-9]+]]			; GCN: s_load_dword [[SGPR:s[0-9]+]]
	; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}			; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
	; GCN: s_and_b64 vcc, exec, vcc			; GCN: s_and_b64 vcc, exec, vcc
	; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]			; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
	; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1			; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
	; GCN-NOHSA: buffer_store_dword [[ONE]]			; GCN-NOHSA: buffer_store_dword [[ONE]]
	; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]			; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
	; GCN; {{^}}[[EXIT]]:			; GCN: {{^}}[[EXIT]]:
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {			define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
	bb3: ; preds = %bb2			bb3: ; preds = %bb2
	%tmp0 = bitcast i32 %cond to float			%tmp0 = bitcast i32 %cond to float
	%tmp1 = fadd float %tmp0, 2.500000e-01			%tmp1 = fadd float %tmp0, 2.500000e-01
	%tmp2 = bitcast float %tmp1 to i32			%tmp2 = bitcast float %tmp1 to i32
	%tmp3 = icmp ult i32 %tmp2, %cond			%tmp3 = icmp ult i32 %tmp2, %cond
	br i1 %tmp3, label %bb6, label %bb7			br i1 %tmp3, label %bb6, label %bb7
	▲ Show 20 Lines • Show All 57 Lines • Show Last 20 Lines

test/CodeGen/ARM/2007-05-22-tailmerge-3.ll

	; RUN: llc < %s -march=arm \| FileCheck %s			; RUN: llc < %s -march=arm \| FileCheck %s
	; RUN: llc < %s -march=arm -enable-tail-merge=0 \| \			; RUN: llc < %s -march=arm -enable-tail-merge=0 \| \
	; RUN: FileCheck --check-prefix=NOMERGE %s			; RUN: FileCheck --check-prefix=NOMERGE %s

	; Check that tail merging is the default on ARM, and that -enable-tail-merge=0			; Check that tail merging is the default on ARM, and that -enable-tail-merge=0
	; works.			; works.
	; PR1628			; PR1628

	; CHECK: bl _baz			; CHECK: bl _baz
	; CHECK-NOT: bl _baz			; CHECK-NOT: bl _baz

	; CHECK: bl _quux			; CHECK: bl _quux
	; CHECK-NOT: bl _quux			; CHECK-NOT: bl _quux

	; NOMERGE: bl _baz			; NOMERGE-DAG: bl _baz
	; NOMERGE: bl _baz			; NOMERGE-DAG: bl _baz

	; NOMERGE: bl _quux			; NOMERGE-DAG: bl _quux
	; NOMERGE: bl _quux			; NOMERGE-DAG: bl _quux

	; ModuleID = 'tail.c'			; ModuleID = 'tail.c'
	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
	target triple = "i686-apple-darwin8"			target triple = "i686-apple-darwin8"

	define i32 @f(i32 %i, i32 %q) {			define i32 @f(i32 %i, i32 %q) {
	entry:			entry:
	%i_addr = alloca i32 ; <i32*> [#uses=2]			%i_addr = alloca i32 ; <i32*> [#uses=2]
	▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines

test/CodeGen/ARM/atomic-cmpxchg.ll

	Show First 20 Lines • Show All 60 Lines • ▼ Show 20 Lines

	; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8:			; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8:
	; CHECK-ARMV7-NEXT: .fnstart			; CHECK-ARMV7-NEXT: .fnstart
	; CHECK-ARMV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1			; CHECK-ARMV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
	; CHECK-ARMV7-NEXT: b [[TRY:.LBB[0-9_]+]]			; CHECK-ARMV7-NEXT: b [[TRY:.LBB[0-9_]+]]
	; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:			; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
	; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]			; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
	; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0			; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
	; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1			; CHECK-ARMV7-NEXT: moveq r0, #1
	; CHECK-ARMV7-NEXT: bxeq lr			; CHECK-ARMV7-NEXT: bxeq lr
	; CHECK-ARMV7-NEXT: [[TRY]]:			; CHECK-ARMV7-NEXT: [[TRY]]:
	; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]			; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
	; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]			; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
	; CHECK-ARMV7-NEXT: beq [[HEAD]]			; CHECK-ARMV7-NEXT: beq [[HEAD]]
	; CHECK-ARMV7-NEXT: clrex			; CHECK-ARMV7-NEXT: clrex
	; CHECK-ARMV7-NEXT: mov [[RES]], #0			; CHECK-ARMV7-NEXT: mov r0, #0
	; CHECK-ARMV7-NEXT: bx lr			; CHECK-ARMV7-NEXT: bx lr

	; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:			; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
	; CHECK-THUMBV7-NEXT: .fnstart			; CHECK-THUMBV7-NEXT: .fnstart
	; CHECK-THUMBV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1			; CHECK-THUMBV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
	; CHECK-THUMBV7-NEXT: b [[TRYLD:.LBB[0-9_]+]]			; CHECK-THUMBV7-NEXT: b [[TRYLD:.LBB[0-9_]+]]
	; CHECK-THUMBV7-NEXT: [[TRYST:.LBB[0-9_]+]]:			; CHECK-THUMBV7-NEXT: [[TRYST:.LBB[0-9_]+]]:
	; CHECK-THUMBV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]			; CHECK-THUMBV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
	Show All 11 Lines

test/CodeGen/ARM/fold-stack-adjust.ll

	Show First 20 Lines • Show All 129 Lines • ▼ Show 20 Lines

	; PR18136: there was a bug determining where the first eligible pop in a			; PR18136: there was a bug determining where the first eligible pop in a
	; basic-block was when the entire block was epilogue code.			; basic-block was when the entire block was epilogue code.
	define void @test_fold_point(i1 %tst) minsize {			define void @test_fold_point(i1 %tst) minsize {
	; CHECK-LABEL: test_fold_point:			; CHECK-LABEL: test_fold_point:

	; Important to check for beginning of basic block, because if it gets			; Important to check for beginning of basic block, because if it gets
	; if-converted the test is probably no longer checking what it should.			; if-converted the test is probably no longer checking what it should.
	; CHECK: {{LBB[0-9]+_2}}:			; CHECK: %end
	; CHECK-NEXT: vpop {d7, d8}			; CHECK-NEXT: vpop {d7, d8}
	; CHECK-NEXT: pop {r4, pc}			; CHECK-NEXT: pop {r4, pc}

	; With a guaranteed frame-pointer, we want to make sure that its offset in the			; With a guaranteed frame-pointer, we want to make sure that its offset in the
	; push block is correct, even if a few registers have been tacked onto a later			; push block is correct, even if a few registers have been tacked onto a later
	; vpush (PR18160).			; vpush (PR18160).
	; CHECK-IOS-LABEL: test_fold_point:			; CHECK-IOS-LABEL: test_fold_point:
	; CHECK-IOS: push {r4, r7, lr}			; CHECK-IOS: push {r4, r7, lr}
	▲ Show 20 Lines • Show All 89 Lines • Show Last 20 Lines

test/CodeGen/PowerPC/tail-dup-break-cfg.ll

Show All 10 Lines
; exit		; exit

;CHECK-LABEL: tail_dup_break_cfg:		;CHECK-LABEL: tail_dup_break_cfg:
;CHECK: mr [[TAGREG:[0-9]+]], 3		;CHECK: mr [[TAGREG:[0-9]+]], 3
;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1		;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]		;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]
;CHECK-NEXT: # %test2		;CHECK-NEXT: # %test2
;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30		;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]		;CHECK-NEXT: bne 0, [[BODY2LABEL:[._0-9A-Za-z]+]]
;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]		;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
		;CHECK: blr
;CHECK-NEXT: [[BODY1LABEL]]		;CHECK-NEXT: [[BODY1LABEL]]
;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30		;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
;CHECK-NEXT: beq 0, [[EXITLABEL]]		;CHECK-NEXT: beq 0, [[EXITLABEL]]
;CHECK-NEXT: [[BODY2LABEL]]		;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]:
;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit		;CHECK: b [[EXITLABEL]]
;CHECK: blr
define void @tail_dup_break_cfg(i32 %tag) {		define void @tail_dup_break_cfg(i32 %tag) {
entry:		entry:
br label %test1		br label %test1
test1:		test1:
%tagbit1 = and i32 %tag, 1		%tagbit1 = and i32 %tag, 1
%tagbit1eq0 = icmp eq i32 %tagbit1, 0		%tagbit1eq0 = icmp eq i32 %tagbit1, 0
br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely		br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely
body1:		body1:
Show All 39 Lines	body1:
call void @a()		call void @a()
call void @a()		call void @a()
call void @a()		call void @a()
call void @a()		call void @a()
br label %test2		br label %test2
test2:		test2:
%tagbit2 = and i32 %tag, 2		%tagbit2 = and i32 %tag, 2
%tagbit2eq0 = icmp ne i32 %tagbit2, 0		%tagbit2eq0 = icmp ne i32 %tagbit2, 0
br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely		br i1 %tagbit2eq0, label %body2, label %exit, !prof !3 ; %body2 more likely
body2:		body2:
call void @b()		call void @b()
call void @b()		call void @b()
call void @b()		call void @b()
call void @b()		call void @b()
br label %exit		br label %exit
exit:		exit:
ret void		ret void
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	v:
br label %ret		br label %ret
ret:		ret:
ret void		ret void
}		}


!1 = !{!"branch_weights", i32 5, i32 3}		!1 = !{!"branch_weights", i32 5, i32 3}
!2 = !{!"branch_weights", i32 95, i32 5}		!2 = !{!"branch_weights", i32 95, i32 5}
!3 = !{!"branch_weights", i32 7, i32 3}		!3 = !{!"branch_weights", i32 8, i32 3}

test/CodeGen/PowerPC/tail-dup-layout.ll

	; RUN: llc -outline-optional-branches -O2 < %s \| FileCheck %s			; RUN: llc -O2 < %s \| FileCheck %s
	target datalayout = "e-m:e-i64:64-n32:64"			target datalayout = "e-m:e-i64:64-n32:64"
	target triple = "powerpc64le-grtev4-linux-gnu"			target triple = "powerpc64le-grtev4-linux-gnu"

	; Intended layout:			; Intended layout:
	; The outlining flag produces the layout			; The chain-based outlining produces the layout
	; test1			; test1
	; test2			; test2
	; test3			; test3
	; test4			; test4
	; exit
	; optional1			; optional1
	; optional2			; optional2
	; optional3			; optional3
	; optional4			; optional4
				; exit
	; Tail duplication puts test n+1 at the end of optional n			; Tail duplication puts test n+1 at the end of optional n
	; so optional1 includes a copy of test2 at the end, and branches			; so optional1 includes a copy of test2 at the end, and branches
	; to test3 (at the top) or falls through to optional 2.			; to test3 (at the top) or falls through to optional 2.
	; The CHECK statements check for the whole string of tests and exit block,			; The CHECK statements check for the whole string of tests
	; and then check that the correct test has been duplicated into the end of			; and then check that the correct test has been duplicated into the end of
	; the optional blocks and that the optional blocks are in the correct order.			; the optional blocks and that the optional blocks are in the correct order.
	;CHECK-LABEL: f:			;CHECK-LABEL: straight_test:
	; test1 may have been merged with entry			; test1 may have been merged with entry
	;CHECK: mr [[TAGREG:[0-9]+]], 3			;CHECK: mr [[TAGREG:[0-9]+]], 3
	;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1			;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
	;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]			;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
	;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2			;CHECK-NEXT: # %test2
	;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30			;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
	;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]			;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
	;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3			;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
	;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29			;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
	;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]			;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
	;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4			;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4
	;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28			;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
	;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]			;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]
	;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit			;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
	;CHECK: blr			;CHECK: blr
	;CHECK-NEXT: [[OPT1LABEL]]			;CHECK-NEXT: .[[OPT1LABEL]]:
	;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30			;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
	;CHECK-NEXT: beq 0, [[TEST3LABEL]]			;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
	;CHECK-NEXT: [[OPT2LABEL]]			;CHECK-NEXT: .[[OPT2LABEL]]:
	;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29			;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
	;CHECK-NEXT: beq 0, [[TEST4LABEL]]			;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
	;CHECK-NEXT: [[OPT3LABEL]]			;CHECK-NEXT: .[[OPT3LABEL]]:
	;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28			;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
	;CHECK-NEXT: beq 0, [[EXITLABEL]]			;CHECK-NEXT: beq 0, .[[EXITLABEL]]
	;CHECK-NEXT: [[OPT4LABEL]]			;CHECK-NEXT: .[[OPT4LABEL]]:
	;CHECK: b [[EXITLABEL]]			;CHECK: b .[[EXITLABEL]]

	define void @f(i32 %tag) {			define void @straight_test(i32 %tag) {
	entry:			entry:
	br label %test1			br label %test1
	test1:			test1:
	%tagbit1 = and i32 %tag, 1			%tagbit1 = and i32 %tag, 1
	%tagbit1eq0 = icmp eq i32 %tagbit1, 0			%tagbit1eq0 = icmp eq i32 %tagbit1, 0
	br i1 %tagbit1eq0, label %test2, label %optional1			br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
	optional1:			optional1:
	call void @a()			call void @a()
	call void @a()			call void @a()
	call void @a()			call void @a()
	call void @a()			call void @a()
	br label %test2			br label %test2
	test2:			test2:
	%tagbit2 = and i32 %tag, 2			%tagbit2 = and i32 %tag, 2
	%tagbit2eq0 = icmp eq i32 %tagbit2, 0			%tagbit2eq0 = icmp eq i32 %tagbit2, 0
	br i1 %tagbit2eq0, label %test3, label %optional2			br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
	optional2:			optional2:
	call void @b()			call void @b()
	call void @b()			call void @b()
	call void @b()			call void @b()
	call void @b()			call void @b()
	br label %test3			br label %test3
	test3:			test3:
	%tagbit3 = and i32 %tag, 4			%tagbit3 = and i32 %tag, 4
	%tagbit3eq0 = icmp eq i32 %tagbit3, 0			%tagbit3eq0 = icmp eq i32 %tagbit3, 0
	br i1 %tagbit3eq0, label %test4, label %optional3			br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
	optional3:			optional3:
	call void @c()			call void @c()
	call void @c()			call void @c()
	call void @c()			call void @c()
	call void @c()			call void @c()
	br label %test4			br label %test4
	test4:			test4:
	%tagbit4 = and i32 %tag, 8			%tagbit4 = and i32 %tag, 8
	%tagbit4eq0 = icmp eq i32 %tagbit4, 0			%tagbit4eq0 = icmp eq i32 %tagbit4, 0
	br i1 %tagbit4eq0, label %exit, label %optional4			br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1
	optional4:			optional4:
	call void @d()			call void @d()
	call void @d()			call void @d()
	call void @d()			call void @d()
	call void @d()			call void @d()
	br label %exit			br label %exit
	exit:			exit:
	ret void			ret void
	}			}

				; Intended layout:
				; The chain-based outlining produces the layout
				; entry
				; --- Begin loop ---
				; for.latch
				; for.check
				; test1
				; test2
				; test3
				; test4
				; optional1
				; optional2
				; optional3
				; optional4
				; --- End loop ---
				; exit
				; The CHECK statements check for the whole string of tests and exit block,
				; and then check that the correct test has been duplicated into the end of
				; the optional blocks and that the optional blocks are in the correct order.
				;CHECK-LABEL: loop_test:
				;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4
				;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch
				;CHECK: addi
				;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
				;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
				;CHECK: # %test1
				;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
				;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
				;CHECK-NEXT: # %test2
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
				;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
				;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
				;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
				;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4\|optional3)}}
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
				;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
				;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
				;CHECK: [[OPT1LABEL]]
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
				;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
				;CHECK-NEXT: .[[OPT2LABEL]]
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
				;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
				;CHECK-NEXT: .[[OPT3LABEL]]
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
				;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
				;CHECK: [[OPT4LABEL]]:
				;CHECK: b .[[LATCHLABEL]]
				define void @loop_test(i32* %tags, i32 %count) {
				entry:
				br label %for.check
				for.check:
				%count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch]
				%done.count = icmp ugt i32 %count.loop, 0
				%tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count
				%tag = load i32, i32* %tag_ptr
				%done.tag = icmp eq i32 %tag, 0
				%done = and i1 %done.count, %done.tag
				br i1 %done, label %test1, label %exit, !prof !1
				test1:
				%tagbit1 = and i32 %tag, 1
				%tagbit1eq0 = icmp eq i32 %tagbit1, 0
				br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
				optional1:
				call void @a()
				call void @a()
				call void @a()
				call void @a()
				br label %test2
				test2:
				%tagbit2 = and i32 %tag, 2
				%tagbit2eq0 = icmp eq i32 %tagbit2, 0
				br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
				optional2:
				call void @b()
				call void @b()
				call void @b()
				call void @b()
				br label %test3
				test3:
				%tagbit3 = and i32 %tag, 4
				%tagbit3eq0 = icmp eq i32 %tagbit3, 0
				br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
				optional3:
				call void @c()
				call void @c()
				call void @c()
				call void @c()
				br label %test4
				test4:
				%tagbit4 = and i32 %tag, 8
				%tagbit4eq0 = icmp eq i32 %tagbit4, 0
				br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1
				optional4:
				call void @d()
				call void @d()
				call void @d()
				call void @d()
				br label %for.latch
				for.latch:
				%count.sub = sub i32 %count.loop, 1
				br label %for.check
				exit:
				ret void
				}

				; The block then2 is not unavoidable, but since it can be tail-duplicated, it
				[Inline comment by davidxl (unsubmitted, done)]: The term 'unavoidable' is not well defined -- why is 'then2' unavoidable?
				; should be placed as a fallthrough from test2 and copied.
				; CHECK-LABEL: avoidable_test:
				; CHECK: # %entry
				; CHECK: andi.
				; CHECK: # %test2
				; Make sure then2 falls through from test2
				; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
				; CHECK: # %then2
				; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
				; CHECK: # %end2
				; CHECK: # %else1
				; CHECK: bl a
				; CHECK: bl a
				; Make sure then2 was copied into else1
				; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
				; CHECK: # %end1
				; CHECK: bl d
				; CHECK: # %else2
				; CHECK: bl c
				define void @avoidable_test(i32 %tag) {
				entry:
				br label %test1
				test1:
				%tagbit1 = and i32 %tag, 1
				%tagbit1eq0 = icmp eq i32 %tagbit1, 0
				br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely
				else1:
				call void @a()
				call void @a()
				br label %then2
				test2:
				%tagbit2 = and i32 %tag, 2
				%tagbit2eq0 = icmp eq i32 %tagbit2, 0
				br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely
				then2:
				%tagbit3 = and i32 %tag, 4
				%tagbit3eq0 = icmp eq i32 %tagbit3, 0
				br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely
				else2:
				call void @c()
				br label %end2
				end2:
				ret void
				end1:
				call void @d()
				ret void
				}

				; CHECK-LABEL: lattice_test
				; The most important edge here is f->ret. It's twice as hot as the rest
				[Inline comment by davidxl (unsubmitted, not done)]: Is this comment relevant here?
				[Inline reply by iteratee, author (unsubmitted, not done)]: Yes.
				; We should also see:
				; entry -> (b\|c)
				; b -> (d\|e)
				; c -> (d\|e)
				; (d\|e) -> f
				; CHECK: # %entry
				; CHECK: # %c
				; CHECK: # %d
				; CHECK: # %f
				; CHECK: # %ret
				; CHECK: # %b
				; CHECK: # %e
				define void @lattice_test(i32 %tag) {
				[Inline comment by davidxl (unsubmitted, done)]: Change the name to trellis_test.
				entry:
				br label %a
				a:
				call void @a()
				call void @a()
				%tagbits.a = and i32 %tag, 3
				%tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
				br i1 %tagbits.a.eq0, label %c, label %b, !prof !2 ; balanced
				[Inline comment by davidxl (unsubmitted, not done)]: Use non-equal branch probabilities here to make the result more obvious in different scenarios: when there is a conflict between the best incoming edges, when there is no conflict, etc.
				[Inline reply by iteratee, author (unsubmitted, not done)]: The test is now larger. It handles conflicting incoming edges, a couple of non-conflicting edges, and a triangle.
				c:
				call void @c()
				call void @c()
				%tagbits.c = and i32 %tag, 12
				%tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
				br i1 %tagbits.c.eq0, label %e, label %d, !prof !2 ; balanced
				e:
				call void @e()
				call void @e()
				%tagbits.e = and i32 %tag, 48
				%tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
				br i1 %tagbits.e.eq0, label %f, label %ret, !prof !2 ; balanced
				b:
				call void @b()
				call void @b()
				%tagbits.b = and i32 %tag, 12
				%tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
				br i1 %tagbits.b.eq1, label %d, label %e, !prof !2 ; balanced
				d:
				call void @d()
				call void @d()
				%tagbits.d = and i32 %tag, 48
				%tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
				br i1 %tagbits.d.eq1, label %f, label %ret, !prof !2 ; balanced
				f:
				call void @f()
				call void @f()
				br label %ret
				ret:
				ret void
				[Inline comment by davidxl (unsubmitted, not done)]: We probably also need a test for a trellis+triangle shape (without taildup).
				[Inline reply by iteratee, author (unsubmitted, not done)]: f->ret is a triangle edge. I'll make the test bigger with non-balanced edges to cover more scenarios.
				}


	declare void @a()			declare void @a()
	declare void @b()			declare void @b()
	declare void @c()			declare void @c()
	declare void @d()			declare void @d()
				declare void @e()
				declare void @f()

				!1 = !{!"branch_weights", i32 5, i32 3}
				!2 = !{!"branch_weights", i32 50, i32 50}
				[Inline comment by davidxl (unsubmitted, done)]: Change name.

test/CodeGen/SPARC/sjlj.ll

	Show First 20 Lines • Show All 61 Lines • ▼ Show 20 Lines
	; CHECK: st %i1, [%i0+4]			; CHECK: st %i1, [%i0+4]
	; CHECK: st %sp, [%i0+8]			; CHECK: st %sp, [%i0+8]
	; CHECK: bn .LBB1_2			; CHECK: bn .LBB1_2
	; CHECK: st %i7, [%i0+12]			; CHECK: st %i7, [%i0+12]
	; CHECK: ba .LBB1_1			; CHECK: ba .LBB1_1
	; CHECK: nop			; CHECK: nop
	; CHECK:.LBB1_1: ! %entry			; CHECK:.LBB1_1: ! %entry
	; CHECK: mov %g0, %i0			; CHECK: mov %g0, %i0
				; CHECK: ! %entry
	; CHECK: cmp %i0, 0			; CHECK: cmp %i0, 0
	; CHECK: bne .LBB1_4
	; CHECK: ba .LBB1_5
	; CHECK:.LBB1_2: ! Block address taken
	; CHECK: mov 1, %i0
	; CHECK: be .LBB1_5			; CHECK: be .LBB1_5
				; CHECK: nop
	; CHECK:.LBB1_4:			; CHECK:.LBB1_4:
				; CHECK: mov 1, %i0
	; CHECK: ba .LBB1_6			; CHECK: ba .LBB1_6
				; CHECK:.LBB1_2: ! Block address taken
				; CHECK: mov 1, %i0
				; CHECK: cmp %i0, 0
				; CHECK: bne .LBB1_4
				; CHECK: nop
	}			}
	declare i8* @llvm.frameaddress(i32) #2			declare i8* @llvm.frameaddress(i32) #2

	declare i8* @llvm.stacksave() #3			declare i8* @llvm.stacksave() #3

	declare i32 @llvm.eh.sjlj.setjmp(i8*) #3			declare i32 @llvm.eh.sjlj.setjmp(i8*) #3

	attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }			attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #1 = { noreturn nounwind }			attributes #1 = { noreturn nounwind }
	attributes #2 = { nounwind readnone }			attributes #2 = { nounwind readnone }
	attributes #3 = { nounwind }			attributes #3 = { nounwind }

test/CodeGen/WebAssembly/mem-intrinsics.ll

	; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0\| FileCheck %s			; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 \| FileCheck %s

	; Test memcpy, memmove, and memset intrinsics.			; Test memcpy, memmove, and memset intrinsics.

	target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"			target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
	target triple = "wasm32-unknown-unknown"			target triple = "wasm32-unknown-unknown"

	declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)			declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
	declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)			declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
	▲ Show 20 Lines • Show All 131 Lines • Show Last 20 Lines

test/CodeGen/X86/block-placement.ll

Show First 20 Lines • Show All 308 Lines • ▼ Show 20 Lines

exit:		exit:
ret i32 %sum		ret i32 %sum
}		}

define void @unnatural_cfg1() {		define void @unnatural_cfg1() {
; Test that we can handle a loop with an inner unnatural loop at the end of		; Test that we can handle a loop with an inner unnatural loop at the end of
; a function. This is a gross CFG reduced out of the single source GCC.		; a function. This is a gross CFG reduced out of the single source GCC.
; CHECK: unnatural_cfg1		; CHECK-LABEL: unnatural_cfg1
; CHECK: %entry		; CHECK: %entry
; CHECK: %loop.body1		; CHECK: %loop.body1
; CHECK: %loop.body2		; CHECK: %loop.body2
; CHECK: %loop.body3		; CHECK: %loop.body3

entry:		entry:
br label %loop.header		br label %loop.header

Show All 21 Lines	loop.body5:
%ptr2 = load i32, i32* undef, align 4		%ptr2 = load i32, i32* undef, align 4
br label %loop.body3		br label %loop.body3
}		}

define void @unnatural_cfg2() {		define void @unnatural_cfg2() {
; Test that we can handle a loop with a nested natural loop and an unnatural		; Test that we can handle a loop with a nested natural loop and an unnatural
; loop. This was reduced from a crash on block placement when run over		; loop. This was reduced from a crash on block placement when run over
; single-source GCC.		; single-source GCC.
; CHECK: unnatural_cfg2		; CHECK-LABEL: unnatural_cfg2
; CHECK: %entry		; CHECK: %entry
; CHECK: %loop.body1		; CHECK: %loop.body1
; CHECK: %loop.body2		; CHECK: %loop.body2
; CHECK: %loop.body3
; CHECK: %loop.inner1.begin
; The end block is folded with %loop.body3...
; CHECK-NOT: %loop.inner1.end
; CHECK: %loop.body4		; CHECK: %loop.body4
; CHECK: %loop.inner2.begin		; CHECK: %loop.inner2.begin
; The loop.inner2.end block is folded		; CHECK: %loop.inner2.begin
		; CHECK: %loop.body3
		; CHECK: %loop.inner1.begin
; CHECK: %loop.header		; CHECK: %loop.header
; CHECK: %bail		; CHECK: %bail

entry:		entry:
br label %loop.header		br label %loop.header

loop.header:		loop.header:
%comp0 = icmp eq i32* undef, null		%comp0 = icmp eq i32* undef, null
▲ Show 20 Lines • Show All 180 Lines • ▼ Show 20 Lines
declare i32 @__gxx_personality_v0(...)		declare i32 @__gxx_personality_v0(...)

define void @test_eh_lpad_successor() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {		define void @test_eh_lpad_successor() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
; Some times the landing pad ends up as the first successor of an invoke block.		; Some times the landing pad ends up as the first successor of an invoke block.
; When this happens, a strange result used to fall out of updateTerminators: we		; When this happens, a strange result used to fall out of updateTerminators: we
; didn't correctly locate the fallthrough successor, assuming blindly that the		; didn't correctly locate the fallthrough successor, assuming blindly that the
; first one was the fallthrough successor. As a result, we would add an		; first one was the fallthrough successor. As a result, we would add an
; erroneous jump to the landing pad thinking that was the default successor.		; erroneous jump to the landing pad thinking that was the default successor.
; CHECK: test_eh_lpad_successor		; CHECK-LABEL: test_eh_lpad_successor
; CHECK: %entry		; CHECK: %entry
; CHECK-NOT: jmp		; CHECK-NOT: jmp
; CHECK: %loop		; CHECK: %loop

entry:		entry:
invoke i32 @f() to label %preheader unwind label %lpad		invoke i32 @f() to label %preheader unwind label %lpad

preheader:		preheader:
Show All 11 Lines
declare void @fake_throw() noreturn		declare void @fake_throw() noreturn

define void @test_eh_throw() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {		define void @test_eh_throw() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
; For blocks containing a 'throw' (or similar functionality), we have		; For blocks containing a 'throw' (or similar functionality), we have
; a no-return invoke. In this case, only EH successors will exist, and		; a no-return invoke. In this case, only EH successors will exist, and
; fallthrough simply won't occur. Make sure we don't crash trying to update		; fallthrough simply won't occur. Make sure we don't crash trying to update
; terminators for such constructs.		; terminators for such constructs.
;		;
; CHECK: test_eh_throw		; CHECK-LABEL: test_eh_throw
; CHECK: %entry		; CHECK: %entry
; CHECK: %cleanup		; CHECK: %cleanup

entry:		entry:
invoke void @fake_throw() to label %continue unwind label %cleanup		invoke void @fake_throw() to label %continue unwind label %cleanup

continue:		continue:
unreachable		unreachable

cleanup:		cleanup:
%0 = landingpad { i8*, i32 }		%0 = landingpad { i8*, i32 }
cleanup		cleanup
unreachable		unreachable
}		}

define void @test_unnatural_cfg_backwards_inner_loop() {		define void @test_unnatural_cfg_backwards_inner_loop() {
; Test that when we encounter an unnatural CFG structure after having formed		; Test that when we encounter an unnatural CFG structure after having formed
; a chain for an inner loop which happened to be laid out backwards we don't		; a chain for an inner loop which happened to be laid out backwards we don't
; attempt to merge onto the wrong end of the inner loop just because we find it		; attempt to merge onto the wrong end of the inner loop just because we find it
; first. This was reduced from a crasher in GCC's single source.		; first. This was reduced from a crasher in GCC's single source.
;		;
; CHECK: test_unnatural_cfg_backwards_inner_loop		; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
; CHECK: %entry		; CHECK: %entry
; CHECK: %loop2b		; CHECK: %loop2b
; CHECK: %loop1		; CHECK: %loop1

entry:		entry:
br i1 undef, label %loop2a, label %body		br i1 undef, label %loop2a, label %body

body:		body:
Show All 23 Lines

define void @unanalyzable_branch_to_loop_header() {		define void @unanalyzable_branch_to_loop_header() {
; Ensure that we can handle unanalyzable branches into loop headers. We		; Ensure that we can handle unanalyzable branches into loop headers. We
; pre-form chains for unanalyzable branches, and will find the tail end of that		; pre-form chains for unanalyzable branches, and will find the tail end of that
; at the start of the loop. This function uses floating point comparison		; at the start of the loop. This function uses floating point comparison
; fallthrough because that happens to always produce unanalyzable branches on		; fallthrough because that happens to always produce unanalyzable branches on
; x86.		; x86.
;		;
; CHECK: unanalyzable_branch_to_loop_header		; CHECK-LABEL: unanalyzable_branch_to_loop_header
; CHECK: %entry		; CHECK: %entry
; CHECK: %loop		; CHECK: %loop
; CHECK: %exit		; CHECK: %exit

entry:		entry:
%cmp = fcmp une double 0.000000e+00, undef		%cmp = fcmp une double 0.000000e+00, undef
br i1 %cmp, label %loop, label %exit		br i1 %cmp, label %loop, label %exit

loop:		loop:
%cond = icmp eq i8 undef, 42		%cond = icmp eq i8 undef, 42
br i1 %cond, label %exit, label %loop		br i1 %cond, label %exit, label %loop

exit:		exit:
ret void		ret void
}		}

define void @unanalyzable_branch_to_best_succ(i1 %cond) {		define void @unanalyzable_branch_to_best_succ(i1 %cond) {
; Ensure that we can handle unanalyzable branches where the destination block		; Ensure that we can handle unanalyzable branches where the destination block
; gets selected as the optimal successor to merge.		; gets selected as the optimal successor to merge.
;		;
; This branch is now analyzable and hence the destination block becomes the		; This branch is now analyzable and hence the destination block becomes the
; hotter one. The right order is entry->bar->exit->foo.		; hotter one. The right order is entry->bar->exit->foo.
;		;
; CHECK: unanalyzable_branch_to_best_succ		; CHECK-LABEL: unanalyzable_branch_to_best_succ
; CHECK: %entry		; CHECK: %entry
; CHECK: %bar		; CHECK: %bar
; CHECK: %exit		; CHECK: %exit
; CHECK: %foo		; CHECK: %foo

entry:		entry:
; Bias this branch toward bar to ensure we form that chain.		; Bias this branch toward bar to ensure we form that chain.
br i1 %cond, label %bar, label %foo, !prof !1		br i1 %cond, label %bar, label %foo, !prof !1
Show All 9 Lines
exit:		exit:
ret void		ret void
}		}

define void @unanalyzable_branch_to_free_block(float %x) {		define void @unanalyzable_branch_to_free_block(float %x) {
; Ensure that we can handle unanalyzable branches where the destination block		; Ensure that we can handle unanalyzable branches where the destination block
; gets selected as the best free block in the CFG.		; gets selected as the best free block in the CFG.
;		;
; CHECK: unanalyzable_branch_to_free_block		; CHECK-LABEL: unanalyzable_branch_to_free_block
; CHECK: %entry		; CHECK: %entry
; CHECK: %a		; CHECK: %a
; CHECK: %b		; CHECK: %b
; CHECK: %c		; CHECK: %c
; CHECK: %exit		; CHECK: %exit

entry:		entry:
br i1 undef, label %a, label %b		br i1 undef, label %a, label %b
Show All 13 Lines
exit:		exit:
ret void		ret void
}		}

define void @many_unanalyzable_branches() {		define void @many_unanalyzable_branches() {
; Ensure that we don't crash as we're building up many unanalyzable branches,		; Ensure that we don't crash as we're building up many unanalyzable branches,
; blocks, and loops.		; blocks, and loops.
;		;
; CHECK: many_unanalyzable_branches		; CHECK-LABEL: many_unanalyzable_branches
; CHECK: %entry		; CHECK: %entry
; CHECK: %exit		; CHECK: %exit

entry:		entry:
br label %0		br label %0

%val0 = load volatile float, float* undef		%val0 = load volatile float, float* undef
%cmp0 = fcmp une float %val0, undef		%cmp0 = fcmp une float %val0, undef
▲ Show 20 Lines • Show All 202 Lines • ▼ Show 20 Lines
; 1) Loop rotation needs to ensure that the desired exiting edge can be		; 1) Loop rotation needs to ensure that the desired exiting edge can be
; a fallthrough.		; a fallthrough.
; 2) The exiting edge from the loop which is rotated to be laid out at the		; 2) The exiting edge from the loop which is rotated to be laid out at the
; bottom of the loop needs to be exiting into the nearest enclosing loop (to		; bottom of the loop needs to be exiting into the nearest enclosing loop (to
; which there is an exit). Otherwise, we force that enclosing loop into		; which there is an exit). Otherwise, we force that enclosing loop into
; strange layouts that are siginificantly less efficient, often times maing		; strange layouts that are siginificantly less efficient, often times maing
; it discontiguous.		; it discontiguous.
;		;
; CHECK: @benchmark_heapsort		; CHECK-LABEL: @benchmark_heapsort
; CHECK: %entry		; CHECK: %entry
; First rotated loop top.		; First rotated loop top.
; CHECK: .p2align		; CHECK: .p2align
; CHECK: %while.end		; CHECK: %while.end
; %for.cond gets completely tail-duplicated away.		; %for.cond gets completely tail-duplicated away.
; CHECK: %if.then		; CHECK: %if.then
; CHECK: %if.else		; CHECK: %if.else
; CHECK: %if.end10		; CHECK: %if.end10
▲ Show 20 Lines • Show All 384 Lines • ▼ Show 20 Lines
; falls below 80%		; falls below 80%
; The probability for both branches is 85%. For then2 vs else1		; The probability for both branches is 85%. For then2 vs else1
; this results in a compounded probability of 83%.		; this results in a compounded probability of 83%.
; Neither then2->fork1 nor then2->fork2 has a large enough relative		; Neither then2->fork1 nor then2->fork2 has a large enough relative
; probability to break the CFG.		; probability to break the CFG.
; Relative probs:		; Relative probs:
; then2 -> fork1 vs else1 -> fork1 = 71%		; then2 -> fork1 vs else1 -> fork1 = 71%
; then2 -> fork2 vs else2 -> fork2 = 74%		; then2 -> fork2 vs else2 -> fork2 = 74%
		; The lattice checking picks the edges:
		; then2 -> fork2 and else1 -> fork1
		; So only then2 -> fork2 should be chosen.
; CHECK-LABEL: test_forked_hot_diamond_gets_cold		; CHECK-LABEL: test_forked_hot_diamond_gets_cold
; CHECK: %entry		; CHECK: %entry
; CHECK: %then1		; CHECK: %then1
; CHECK: %then2		; CHECK: %then2
		; CHECK: %fork2
; CHECK: %else1		; CHECK: %else1
; CHECK: %fork1		; CHECK: %fork1
; CHECK: %else2
; CHECK: %fork2
; CHECK: %exit		; CHECK: %exit
		; CHECK: %else2
entry:		entry:
%gep1 = getelementptr i32, i32* %a, i32 1		%gep1 = getelementptr i32, i32* %a, i32 1
%val1 = load i32, i32* %gep1		%val1 = load i32, i32* %gep1
%cond1 = icmp ugt i32 %val1, 1		%cond1 = icmp ugt i32 %val1, 1
br i1 %cond1, label %then1, label %else1, !prof !9		br i1 %cond1, label %then1, label %else1, !prof !9

then1:		then1:
call void @hot_function()		call void @hot_function()
Show All 36 Lines
; fallthrough stays above 80%		; fallthrough stays above 80%
; (1:8) followed by (1:1) is still (1:4)		; (1:8) followed by (1:1) is still (1:4)
; Here we use 90% probability because two in a row		; Here we use 90% probability because two in a row
; have a 89 % probability vs the original branch.		; have a 89 % probability vs the original branch.
; CHECK-LABEL: test_forked_hot_diamond_stays_hot		; CHECK-LABEL: test_forked_hot_diamond_stays_hot
; CHECK: %entry		; CHECK: %entry
; CHECK: %then1		; CHECK: %then1
; CHECK: %then2		; CHECK: %then2
; CHECK: %fork1
; CHECK: %else1
; CHECK: %else2
; CHECK: %fork2		; CHECK: %fork2
		; CHECK: %else1
		; CHECK: %fork1
; CHECK: %exit		; CHECK: %exit
		; CHECK: %else2
entry:		entry:
%gep1 = getelementptr i32, i32* %a, i32 1		%gep1 = getelementptr i32, i32* %a, i32 1
%val1 = load i32, i32* %gep1		%val1 = load i32, i32* %gep1
%cond1 = icmp ugt i32 %val1, 1		%cond1 = icmp ugt i32 %val1, 1
br i1 %cond1, label %then1, label %else1, !prof !10		br i1 %cond1, label %then1, label %else1, !prof !10

then1:		then1:
call void @hot_function()		call void @hot_function()
Show All 39 Lines

test/CodeGen/X86/bypass-slow-division-32.ll

	Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: testl $-256, %edi			; CHECK-NEXT: testl $-256, %edi
	; CHECK-NEXT: je .LBB3_1			; CHECK-NEXT: je .LBB3_1
	; CHECK-NEXT: # BB#2:			; CHECK-NEXT: # BB#2:
	; CHECK-NEXT: movl %ecx, %eax			; CHECK-NEXT: movl %ecx, %eax
	; CHECK-NEXT: cltd			; CHECK-NEXT: cltd
	; CHECK-NEXT: idivl %ebx			; CHECK-NEXT: idivl %ebx
	; CHECK-NEXT: movl %eax, %esi			; CHECK-NEXT: movl %eax, %esi
	; CHECK-NEXT: testl $-256, %edi			; CHECK-NEXT: testl $-256, %edi
	; CHECK-NEXT: jne .LBB3_5
	; CHECK-NEXT: jmp .LBB3_4
	; CHECK-NEXT: .LBB3_1:
	; CHECK-NEXT: movzbl %cl, %eax
	; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
	; CHECK-NEXT: divb %bl
	; CHECK-NEXT: movzbl %al, %esi
	; CHECK-NEXT: testl $-256, %edi
	; CHECK-NEXT: je .LBB3_4			; CHECK-NEXT: je .LBB3_4
	; CHECK-NEXT: .LBB3_5:			; CHECK-NEXT: .LBB3_5:
	; CHECK-NEXT: xorl %edx, %edx			; CHECK-NEXT: xorl %edx, %edx
	; CHECK-NEXT: movl %ecx, %eax			; CHECK-NEXT: movl %ecx, %eax
	; CHECK-NEXT: divl %ebx			; CHECK-NEXT: divl %ebx
	; CHECK-NEXT: jmp .LBB3_6			; CHECK-NEXT: jmp .LBB3_6
				; CHECK-NEXT: .LBB3_1:
				; CHECK-NEXT: movzbl %cl, %eax
				; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
				; CHECK-NEXT: divb %bl
				; CHECK-NEXT: movzbl %al, %esi
				; CHECK-NEXT: testl $-256, %edi
				; CHECK-NEXT: jne .LBB3_5
	; CHECK-NEXT: .LBB3_4:			; CHECK-NEXT: .LBB3_4:
	; CHECK-NEXT: movzbl %cl, %eax			; CHECK-NEXT: movzbl %cl, %eax
	; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>			; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
	; CHECK-NEXT: divb %bl			; CHECK-NEXT: divb %bl
	; CHECK-NEXT: movzbl %al, %eax			; CHECK-NEXT: movzbl %al, %eax
	; CHECK-NEXT: .LBB3_6:			; CHECK-NEXT: .LBB3_6:
	; CHECK-NEXT: addl %eax, %esi			; CHECK-NEXT: addl %eax, %esi
	; CHECK-NEXT: movl %esi, %eax			; CHECK-NEXT: movl %esi, %eax
	▲ Show 20 Lines • Show All 121 Lines • Show Last 20 Lines

test/CodeGen/X86/sse1.ll

	Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines
	; X32: # BB#0: # %entry			; X32: # BB#0: # %entry
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: xorps %xmm0, %xmm0			; X32-NEXT: xorps %xmm0, %xmm0
	; X32-NEXT: je .LBB1_1			; X32-NEXT: je .LBB1_1
	; X32-NEXT: # BB#2: # %entry			; X32-NEXT: # BB#2: # %entry
	; X32-NEXT: xorps %xmm1, %xmm1			; X32-NEXT: xorps %xmm1, %xmm1
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: jne .LBB1_5			; X32-NEXT: jne .LBB1_5
	; X32-NEXT: jmp .LBB1_4			; X32-NEXT: .LBB1_4:
				; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
				; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
				; X32-NEXT: jne .LBB1_8
				; X32-NEXT: .LBB1_7:
				; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
				; X32-NEXT: jmp .LBB1_9
	; X32-NEXT: .LBB1_1:			; X32-NEXT: .LBB1_1:
	; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero			; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: je .LBB1_4			; X32-NEXT: je .LBB1_4
	; X32-NEXT: .LBB1_5: # %entry			; X32-NEXT: .LBB1_5: # %entry
	; X32-NEXT: xorps %xmm2, %xmm2			; X32-NEXT: xorps %xmm2, %xmm2
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: jne .LBB1_8
	; X32-NEXT: jmp .LBB1_7
	; X32-NEXT: .LBB1_4:
	; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: je .LBB1_7			; X32-NEXT: je .LBB1_7
	; X32-NEXT: .LBB1_8: # %entry			; X32-NEXT: .LBB1_8: # %entry
	; X32-NEXT: xorps %xmm3, %xmm3			; X32-NEXT: xorps %xmm3, %xmm3
	; X32-NEXT: jmp .LBB1_9
	; X32-NEXT: .LBB1_7:
	; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
	; X32-NEXT: .LBB1_9: # %entry			; X32-NEXT: .LBB1_9: # %entry
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]			; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
	; X32-NEXT: jne .LBB1_11			; X32-NEXT: jne .LBB1_11
	; X32-NEXT: # BB#10:			; X32-NEXT: # BB#10:
	; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; X32-NEXT: .LBB1_11: # %entry			; X32-NEXT: .LBB1_11: # %entry
	; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]			; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: vselect:			; X64-LABEL: vselect:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: testl %ecx, %ecx			; X64-NEXT: testl %ecx, %ecx
	; X64-NEXT: xorps %xmm0, %xmm0			; X64-NEXT: xorps %xmm0, %xmm0
	; X64-NEXT: je .LBB1_1			; X64-NEXT: je .LBB1_1
	; X64-NEXT: # BB#2: # %entry			; X64-NEXT: # BB#2: # %entry
	; X64-NEXT: xorps %xmm1, %xmm1			; X64-NEXT: xorps %xmm1, %xmm1
	; X64-NEXT: testl %edx, %edx			; X64-NEXT: testl %edx, %edx
	; X64-NEXT: jne .LBB1_5			; X64-NEXT: jne .LBB1_5
	; X64-NEXT: jmp .LBB1_4			; X64-NEXT: .LBB1_4:
				; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
				; X64-NEXT: testl %r8d, %r8d
				; X64-NEXT: jne .LBB1_8
				; X64-NEXT: .LBB1_7:
				; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
				; X64-NEXT: jmp .LBB1_9
	; X64-NEXT: .LBB1_1:			; X64-NEXT: .LBB1_1:
	; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero			; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; X64-NEXT: testl %edx, %edx			; X64-NEXT: testl %edx, %edx
	; X64-NEXT: je .LBB1_4			; X64-NEXT: je .LBB1_4
	; X64-NEXT: .LBB1_5: # %entry			; X64-NEXT: .LBB1_5: # %entry
	; X64-NEXT: xorps %xmm2, %xmm2			; X64-NEXT: xorps %xmm2, %xmm2
	; X64-NEXT: testl %r8d, %r8d			; X64-NEXT: testl %r8d, %r8d
	; X64-NEXT: jne .LBB1_8
	; X64-NEXT: jmp .LBB1_7
	; X64-NEXT: .LBB1_4:
	; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X64-NEXT: testl %r8d, %r8d
	; X64-NEXT: je .LBB1_7			; X64-NEXT: je .LBB1_7
	; X64-NEXT: .LBB1_8: # %entry			; X64-NEXT: .LBB1_8: # %entry
	; X64-NEXT: xorps %xmm3, %xmm3			; X64-NEXT: xorps %xmm3, %xmm3
	; X64-NEXT: jmp .LBB1_9
	; X64-NEXT: .LBB1_7:
	; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB1_9: # %entry			; X64-NEXT: .LBB1_9: # %entry
	; X64-NEXT: testl %esi, %esi			; X64-NEXT: testl %esi, %esi
	; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]			; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
	; X64-NEXT: jne .LBB1_11			; X64-NEXT: jne .LBB1_11
	; X64-NEXT: # BB#10:			; X64-NEXT: # BB#10:
	; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB1_11: # %entry			; X64-NEXT: .LBB1_11: # %entry
	; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines
	; X64-NEXT: movl %eax, (%rdi)			; X64-NEXT: movl %eax, (%rdi)
	; X64-NEXT: movq %rdi, %rax			; X64-NEXT: movq %rdi, %rax
	; X64-NEXT: retq			; X64-NEXT: retq
	%cmp = icmp eq <4 x i32> %x, %y			%cmp = icmp eq <4 x i32> %x, %y
	%zext = zext <4 x i1> %cmp to <4 x i32>			%zext = zext <4 x i1> %cmp to <4 x i32>
	ret <4 x i32> %zext			ret <4 x i32> %zext
	}			}

	; Fragile test warning - we need to induce the generation of a vselect			; Fragile test warning - we need to induce the generation of a vselect
	; post-legalization to cause the crash seen in:			; post-legalization to cause the crash seen in:
	; https://llvm.org/bugs/show_bug.cgi?id=31672			; https://llvm.org/bugs/show_bug.cgi?id=31672
	; Is there a way to do that without an unsafe/fast sqrt intrinsic call?			; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
	; Also, although the goal for adding this test is to prove that we			; Also, although the goal for adding this test is to prove that we
	; don't crash, I have no idea what this code is doing, so I'm keeping			; don't crash, I have no idea what this code is doing, so I'm keeping
	; the full codegen checks in case there's motivation to improve this.			; the full codegen checks in case there's motivation to improve this.

	define <2 x float> @PR31672() #0 {			define <2 x float> @PR31672() #0 {
	▲ Show 20 Lines • Show All 124 Lines • Show Last 20 Lines

test/CodeGen/X86/tail-dup-merge-loop-headers.ll

	; RUN: llc -O2 -o - %s | FileCheck %s			; RUN: llc -O2 -o - %s | FileCheck %s
	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"			target triple = "x86_64-unknown-linux-gnu"

	; Function Attrs: nounwind uwtable			; Function Attrs: nounwind uwtable
	; CHECK-LABEL: tail_dup_merge_loops			; CHECK-LABEL: tail_dup_merge_loops
	; CHECK: # %entry			; CHECK: # %entry
	; CHECK-NOT: # %{{[a-zA-Z_]+}}			; CHECK-NOT: # %{{[a-zA-Z_]+}}
				; CHECK: # %exit
				; CHECK-NOT: # %{{[a-zA-Z_]+}}
	; CHECK: # %inner_loop_exit			; CHECK: # %inner_loop_exit
	; CHECK-NOT: # %{{[a-zA-Z_]+}}			; CHECK-NOT: # %{{[a-zA-Z_]+}}
	; CHECK: # %inner_loop_latch			; CHECK: # %inner_loop_latch
	; CHECK-NOT: # %{{[a-zA-Z_]+}}			; CHECK-NOT: # %{{[a-zA-Z_]+}}
	; CHECK: # %inner_loop_test			; CHECK: # %inner_loop_test
	; CHECK-NOT: # %{{[a-zA-Z_]+}}
	; CHECK: # %exit
	define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {			define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
	entry:			entry:
	%notlhs674.i = icmp eq i32 %a, 0			%notlhs674.i = icmp eq i32 %a, 0
	br label %outer_loop_top			br label %outer_loop_top

	outer_loop_top: ; preds = %inner_loop_exit, %entry			outer_loop_top: ; preds = %inner_loop_exit, %entry
	%dst.0.ph.i = phi i8* [ %b, %entry ], [ %scevgep679.i, %inner_loop_exit ]			%dst.0.ph.i = phi i8* [ %b, %entry ], [ %scevgep679.i, %inner_loop_exit ]
	br i1 %notlhs674.i, label %exit, label %inner_loop_preheader			br i1 %notlhs674.i, label %exit, label %inner_loop_preheader
	▲ Show 20 Lines • Show All 167 Lines • Show Last 20 Lines

test/CodeGen/X86/tail-dup-repeat.ll

	; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s			; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"			target triple = "x86_64-unknown-linux-gnu"

	; Function Attrs: uwtable			; Function Attrs: uwtable
	; When tail-duplicating during placement, we work backward from blocks with			; When tail-duplicating during placement, we work backward from blocks with
	; multiple successors. In this case, the block dup1 gets duplicated into dup2			; multiple successors. In this case, the block dup1 gets duplicated into dup2
	; and if.then64, and then the block dup2 gets duplicated into land.lhs.true			; and if.then64, and then the block dup2 gets duplicated into land.lhs.true
	; and if.end70			; and if.end70
	▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

test/CodeGen/X86/tail-opts.ll

	Show First 20 Lines • Show All 107 Lines • ▼ Show 20 Lines
	; with only a branch in common, regardless of the fallthrough situation.			; with only a branch in common, regardless of the fallthrough situation.

	; CHECK-LABEL: dont_merge_oddly:			; CHECK-LABEL: dont_merge_oddly:
	; CHECK-NOT: ret			; CHECK-NOT: ret
	; CHECK: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}			; CHECK: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
	; CHECK-NEXT: jbe .LBB2_3			; CHECK-NEXT: jbe .LBB2_3
	; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}			; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
	; CHECK-NEXT: ja .LBB2_4			; CHECK-NEXT: ja .LBB2_4
	; CHECK-NEXT: jmp .LBB2_2			; CHECK-NEXT: .LBB2_2:
				; CHECK-NEXT: movb $1, %al
				; CHECK-NEXT: ret
	; CHECK-NEXT: .LBB2_3:			; CHECK-NEXT: .LBB2_3:
	; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}			; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
	; CHECK-NEXT: jbe .LBB2_2			; CHECK-NEXT: jbe .LBB2_2
	; CHECK-NEXT: .LBB2_4:			; CHECK-NEXT: .LBB2_4:
	; CHECK-NEXT: xorl %eax, %eax			; CHECK-NEXT: xorl %eax, %eax
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	; CHECK-NEXT: .LBB2_2:
	; CHECK-NEXT: movb $1, %al
	; CHECK-NEXT: ret

	define i1 @dont_merge_oddly(float* %result) nounwind {			define i1 @dont_merge_oddly(float* %result) nounwind {
	entry:			entry:
	%tmp4 = getelementptr float, float* %result, i32 2			%tmp4 = getelementptr float, float* %result, i32 2
	%tmp5 = load float, float* %tmp4, align 4			%tmp5 = load float, float* %tmp4, align 4
	%tmp7 = getelementptr float, float* %result, i32 4			%tmp7 = getelementptr float, float* %result, i32 4
	%tmp8 = load float, float* %tmp7, align 4			%tmp8 = load float, float* %tmp7, align 4
	%tmp10 = getelementptr float, float* %result, i32 6			%tmp10 = getelementptr float, float* %result, i32 6
	▲ Show 20 Lines • Show All 338 Lines • Show Last 20 Lines

test/CodeGen/X86/twoaddr-coalesce-3.ll

Show All 13 Lines	entry:
br i1 %cmp3, label %for.body.lr.ph, label %for.end		br i1 %cmp3, label %for.body.lr.ph, label %for.end

for.body.lr.ph: ; preds = %entry		for.body.lr.ph: ; preds = %entry
%total.promoted = load i32, i32* @total, align 4		%total.promoted = load i32, i32* @total, align 4
br label %for.body		br label %for.body

; Check that only one mov will be generated in the kernel loop.		; Check that only one mov will be generated in the kernel loop.
; CHECK-LABEL: foo:		; CHECK-LABEL: foo:
; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body		; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]		; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: shrl $31, [[REG1]]		; CHECK: shrl $31, [[REG1]]
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: jl [[LOOP1]]		; CHECK: jl [[LOOP1]]
for.body: ; preds = %for.body.lr.ph, %for.body		for.body: ; preds = %for.body.lr.ph, %for.body
%add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ]		%add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ]
Show All 20 Lines	entry:
br i1 %cmp3, label %for.body.lr.ph, label %for.end		br i1 %cmp3, label %for.body.lr.ph, label %for.end

for.body.lr.ph: ; preds = %entry		for.body.lr.ph: ; preds = %entry
%total.promoted = load i32, i32* @total, align 4		%total.promoted = load i32, i32* @total, align 4
br label %for.body		br label %for.body

; Check that only two mov will be generated in the kernel loop.		; Check that only two mov will be generated in the kernel loop.
; CHECK-LABEL: goo:		; CHECK-LABEL: goo:
; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body		; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]		; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: shrl $31, [[REG2]]		; CHECK: shrl $31, [[REG2]]
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: movl {{.*}}		; CHECK: movl {{.*}}
; CHECK: jl [[LOOP2]]		; CHECK: jl [[LOOP2]]
for.body: ; preds = %for.body.lr.ph, %for.body		for.body: ; preds = %for.body.lr.ph, %for.body
Show All 17 Lines

test/CodeGen/X86/win-alloca-expander.ll

Show First 20 Lines • Show All 109 Lines • ▼ Show 20 Lines	; CHECK: calll __chkstk
call void @f(%struct.S* %p0)		call void @f(%struct.S* %p0)
ret void		ret void
}		}

define void @cfg(i1 %x, i1 %y) {		define void @cfg(i1 %x, i1 %y) {
; Test that the blocks are analyzed in the correct order.		; Test that the blocks are analyzed in the correct order.
; CHECK-LABEL: cfg:		; CHECK-LABEL: cfg:
entry:		entry:
br i1 %x, label %bb1, label %bb2		br i1 %x, label %bb1, label %bb3

bb1:		bb1:
%p1 = alloca %struct.S		%p1 = alloca %struct.S
; CHECK: pushl %eax		; CHECK: pushl %eax
; CHECK: subl $1020, %esp		; CHECK: subl $1020, %esp
br label %bb3		br label %bb4

bb2:		bb2:
%p2 = alloca %struct.T		%p5 = alloca %struct.T
; CHECK: pushl %eax		; CHECK: pushl %eax
; CHECK: subl $2996, %esp		; CHECK: subl $2996, %esp
br label %bb3		call void @g(%struct.T* %p5)
		ret void

bb3:		bb3:
br i1 %y, label %bb4, label %bb5		%p2 = alloca %struct.T
		; CHECK: pushl %eax
		; CHECK: subl $2996, %esp
		br label %bb4

bb4:		bb4:
		br i1 %y, label %bb5, label %bb2

		bb5:
%p4 = alloca %struct.S		%p4 = alloca %struct.S
; CHECK: subl $1024, %esp		; CHECK: subl $1024, %esp
call void @f(%struct.S* %p4)		call void @f(%struct.S* %p4)
ret void		ret void

bb5:
%p5 = alloca %struct.T
; CHECK: pushl %eax
; CHECK: subl $2996, %esp
call void @g(%struct.T* %p5)
ret void
}		}


declare void @f(%struct.S*)		declare void @f(%struct.S*)
declare void @g(%struct.T*)		declare void @g(%struct.T*)
declare void @h(%struct.U*)		declare void @h(%struct.U*)

declare i8* @llvm.stacksave()		declare i8* @llvm.stacksave()
declare void @llvm.stackrestore(i8*)		declare void @llvm.stackrestore(i8*)

This is an archive of the discontinued LLVM Phabricator instance.

Codegen: Make chains from trellis-shaped CFGs (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 87333

lib/CodeGen/MachineBlockPlacement.cpp

test/CodeGen/AArch64/branch-relax-cbz.ll

test/CodeGen/AArch64/combine-comparisons-by-cse.ll

test/CodeGen/AArch64/optimize-cond-branch.ll

test/CodeGen/AMDGPU/basic-branch.ll

test/CodeGen/AMDGPU/branch-relaxation.ll

test/CodeGen/AMDGPU/cf-loop-on-constant.ll

test/CodeGen/AMDGPU/convergent-inlineasm.ll

test/CodeGen/AMDGPU/salu-to-valu.ll

test/CodeGen/ARM/2007-05-22-tailmerge-3.ll

test/CodeGen/ARM/atomic-cmpxchg.ll

test/CodeGen/ARM/fold-stack-adjust.ll

test/CodeGen/PowerPC/tail-dup-break-cfg.ll

test/CodeGen/PowerPC/tail-dup-layout.ll

test/CodeGen/SPARC/sjlj.ll

test/CodeGen/WebAssembly/mem-intrinsics.ll

test/CodeGen/X86/block-placement.ll

test/CodeGen/X86/bypass-slow-division-32.ll

test/CodeGen/X86/sse1.ll

test/CodeGen/X86/tail-dup-merge-loop-headers.ll

test/CodeGen/X86/tail-dup-repeat.ll

test/CodeGen/X86/tail-opts.ll

test/CodeGen/X86/twoaddr-coalesce-3.ll

test/CodeGen/X86/win-alloca-expander.ll

Codegen: Make chains from trellis-shaped CFGs
Closed, Public