This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/trunk/
-
trunk/
-
lib/CodeGen/
-
CodeGen/
-
MachineBlockPlacement.cpp
-
test/CodeGen/
-
CodeGen/
-
AArch64/
-
branch-relax-cbz.ll
-
combine-comparisons-by-cse.ll
-
optimize-cond-branch.ll
-
AMDGPU/
-
basic-branch.ll
-
branch-relaxation.ll
-
cf-loop-on-constant.ll
-
convergent-inlineasm.ll
-
salu-to-valu.ll
-
ARM/
-
2007-05-22-tailmerge-3.ll
-
atomic-cmpxchg.ll
-
fold-stack-adjust.ll
-
PowerPC/
-
tail-dup-break-cfg.ll
-
tail-dup-layout.ll
-
SPARC/
-
sjlj.ll
-
WebAssembly/
-
mem-intrinsics.ll
-
X86/
-
block-placement.ll
-
bypass-slow-division-32.ll
-
sse1.ll
-
tail-dup-merge-loop-headers.ll
-
tail-dup-repeat.ll
-
tail-opts.ll
-
twoaddr-coalesce-3.ll
-
win-alloca-expander.ll

Differential D28522

Codegen: Make chains from trellis-shaped CFGs
ClosedPublic

Authored by iteratee on Jan 10 2017, 11:39 AM.

Download Raw Diff

Details

Reviewers

davidxl
• tstellarAMD
jlebar
javed.absar

Commits

rG7fbec9bdf1b7: Codegen: Make chains from trellis-shaped CFGs
rL295223: Codegen: Make chains from trellis-shaped CFGs

Summary

Lay out trellis-shaped CFGs optimally.
A trellis of the shape below:

A     B
|\   /|
| \ / |
|  X  |
| / \ |
|/   \|
C     D

would be laid out A; B->C ; D by the current layout algorithm. Now we identify
trellises and lay them out either A->C; B->D or A->D; B->C. This scales with an
increasing number of predecessors. A trellis is a group of 2 or more
predecessor blocks that all have the same successors.

Because of this, we can tail duplicate to extend existing trellises.

As an example consider the following CFG:

  B   D   F   H
 / \ / \ / \ / \
A---C---E---G---Ret

Where A,C,E,G are all small (Currently 2 instructions).

The CFG preserving layout is then A,B,C,D,E,F,G,H,Ret.

The current code will copy C into B, E into D and G into F and yield the layout
A,C,B(C),E,D(E),G,F(G),H,Ret

define void @straight_test(i32 %tag) {
entry:
  br label %test1
test1: ; A
  %tagbit1 = and i32 %tag, 1
  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
  br i1 %tagbit1eq0, label %test2, label %optional1
optional1: ; B
  call void @a()
  br label %test2
test2: ; C
  %tagbit2 = and i32 %tag, 2
  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
  br i1 %tagbit2eq0, label %test3, label %optional2
optional2: ; D
  call void @b()
  br label %test3
test3: ; E
  %tagbit3 = and i32 %tag, 4
  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
  br i1 %tagbit3eq0, label %test4, label %optional3
optional3: ; F
  call void @c()
  br label %test4
test4: ; G
  %tagbit4 = and i32 %tag, 8
  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
  br i1 %tagbit4eq0, label %exit, label %optional4
optional4: ; H
  call void @d()
  br label %exit
exit:
  ret void
}

Here is the layout after D27742:

straight_test:                          # @straight_test
; ... Prologue elided
; BB#0:                                 # %entry ; A (merged with test1)
; ... More prologue elided
        mr 30, 3
        andi. 3, 30, 1
        bc 12, 1, .LBB0_2
; BB#1:                                 # %test2 ; C
        rlwinm. 3, 30, 0, 30, 30
        beq      0, .LBB0_3
        b .LBB0_4
.LBB0_2:                                # %optional1 ; B (copy of C)
        bl a
        nop
        rlwinm. 3, 30, 0, 30, 30
        bne      0, .LBB0_4
.LBB0_3:                                # %test3 ; E
        rlwinm. 3, 30, 0, 29, 29
        beq      0, .LBB0_5
        b .LBB0_6
.LBB0_4:                                # %optional2 ; D (copy of E)
        bl b
        nop
        rlwinm. 3, 30, 0, 29, 29
        bne      0, .LBB0_6
.LBB0_5:                                # %test4 ; G
        rlwinm. 3, 30, 0, 28, 28
        beq      0, .LBB0_8
        b .LBB0_7
.LBB0_6:                                # %optional3 ; F (copy of G)
        bl c
        nop
        rlwinm. 3, 30, 0, 28, 28
        beq      0, .LBB0_8
.LBB0_7:                                # %optional4 ; H
        bl d
        nop
.LBB0_8:                                # %exit ; Ret
        ld 30, 96(1)                    # 8-byte Folded Reload
        addi 1, 1, 112
        ld 0, 16(1)
        mtlr 0
        blr

The tail-duplication has produced some benefit, but it has also produced a
trellis which is not laid out optimally. With this patch, we improve the layouts
of such trellises, and decrease the cost calculation for tail-duplication
accordingly.

This patch produces the layout A,C,E,G,B,D,F,H,Ret. This layout does have
back edges, which is a negative, but it has a bigger compensating
positive, which is that it handles the case where there are long strings
of skipped blocks much better than the original layout. Both layouts
handle runs of executed blocks equally well. Branch prediction also
improves if there is any correlation between subsequent optional blocks.

Here is the resulting concrete layout:

straight_test:                          # @straight_test
; BB#0:                                 # %entry ; A (merged with test1)
        mr 30, 3
        andi. 3, 30, 1
        bc 12, 1, .LBB0_4
; BB#1:                                 # %test2 ; C
        rlwinm. 3, 30, 0, 30, 30
        bne      0, .LBB0_5
.LBB0_2:                                # %test3 ; E
        rlwinm. 3, 30, 0, 29, 29
        bne      0, .LBB0_6
.LBB0_3:                                # %test4 ; G
        rlwinm. 3, 30, 0, 28, 28
        bne      0, .LBB0_7
        b .LBB0_8
.LBB0_4:                                # %optional1 ; B (Copy of C)
        bl a
        nop
        rlwinm. 3, 30, 0, 30, 30
        beq      0, .LBB0_2
.LBB0_5:                                # %optional2 ; D (Copy of E)
        bl b
        nop
        rlwinm. 3, 30, 0, 29, 29
        beq      0, .LBB0_3
.LBB0_6:                                # %optional3 ; F (Copy of G)
        bl c
        nop
        rlwinm. 3, 30, 0, 28, 28
        beq      0, .LBB0_8
.LBB0_7:                                # %optional4 ; H
        bl d
        nop
.LBB0_8:                                # %exit

Diff Detail

Repository: rL LLVM

Event Timeline

iteratee updated this revision to Diff 83836.Jan 10 2017, 11:39 AM

iteratee retitled this revision from to Codegen: Make chains from lattice-shaped CFGs.

iteratee updated this object.

iteratee added a reviewer: davidxl.

iteratee set the repository for this revision to rL LLVM.

iteratee added subscribers: arsenm, wdng, nhaehnle and 12 others.

Herald added a reviewer: • tstellarAMD. · View Herald TranscriptJan 10 2017, 11:39 AM

iteratee updated this object.Jan 10 2017, 11:40 AM

iteratee edited edge metadata.

iteratee added a parent revision: D27742: CodeGen: Allow small copyable blocks to "break" the CFG..

davidxl added inline comments.Jan 10 2017, 1:27 PM

lib/CodeGen/MachineBlockPlacement.cpp
624 ↗	(On Diff #83836)	Is it better to relax the condition such that as long as Succ can be dup'd into one of the unplaced predecessors, return true?
630 ↗	(On Diff #83836)	Change the second 'C' to 'C (+ BB') ' where BB' is the dup of BB
633 ↗	(On Diff #83836)	Better change 'B' to 'BB' to match the function argument name
639 ↗	(On Diff #83836)	Change E to 'Succ'

Comment update as requested.

Herald edited edge metadata. · View Herald TranscriptJan 10 2017, 2:49 PM

Herald added a subscriber: dschuff. · View Herald Transcript

iteratee added inline comments.Jan 10 2017, 2:50 PM

lib/CodeGen/MachineBlockPlacement.cpp
624 ↗	(On Diff #83836)	That isn't sufficient, because we need layout to be repeatable. So when we encounter the lattice the second time, there will be no more copies left to make. Chandler also wanted to allow another pass besides layout to do the duplication (perhaps with a larger threshold or different heuristic).

I am not sure I understand your reply about 'repeatability'. Can you elaborate? The suggestion is that if 'Succ' can be dup'ed into 'D', which is unplaced, return true. It basically does the same as skipping 'C'. Besides, checking the successors is not a reliable way to determine whether C is a block with a duplicated BB.

Add comments elaborating on why we use a CFG check rather than checking for partial copiability of blocks or keeping a list of blocks with copies.

Herald edited edge metadata. · View Herald TranscriptJan 10 2017, 3:27 PM

In D28522#641977, @davidxl wrote:

I am not sure I understand your reply about 'repeatability'. Can you elaborate? The suggestion is that if 'Succ' can be dup'ed into 'D', which is unplaced, return true. It basically does the same as skipping 'C'. Besides, checking the successors is not a reliable way to determine whether C is a block with a duplicated BB.

Discussed offline and comments updated to reflect discussion.

iteratee mentioned this in D28583: CodeGen: Allow small copyable blocks to "break" the CFG..Jan 11 2017, 4:34 PM

This patch now adjusts the probability accounting in D28583 to account for lattice layout. This means that more duplications occur. Tests that were in D28583 are now here.

Herald edited edge metadata. · View Herald TranscriptJan 12 2017, 12:03 PM

iteratee edited parent revisions, added: D28583: CodeGen: Allow small copyable blocks to "break" the CFG.; removed: D27742: CodeGen: Allow small copyable blocks to "break" the CFG..Jan 12 2017, 12:04 PM

mzolotukhin removed a subscriber: mzolotukhin.Jan 12 2017, 2:43 PM

Rebased and modified the probability calculations to account for lattices.

Herald added a reviewer: javed.absar. · View Herald TranscriptJan 19 2017, 6:29 PM

Rebasing.

Added lattice check outside of the tail-duplication code.
Updated tests to match.
Added test for non tail-dup lattice.

Is this patch up to date?

Looking over this, it feels like I can split the lattice portion out and put it in first. Even together they aren't very big, so if I can get some initial opinions on this before I start that, it would be appreciated.

I wanted to lay out lattice-type CFG's correctly even if the blocks are larger than what we would tail-duplicate.
To do that I worked out that for a lattice, we can compute which pair of edges forms the optimal fallthrough and use those edges. We don't really have to worry about CFG breaking, because with a lattice we can easily compute the optimal fallthrough pair and take it.

Lattice:
A Set of Predecessor blocks P that all have the same Successors S when |S| >= 2, |P| >=2 and S ∩ P = ∅
We can treat this as a graph optimization problem. There's a well-known general algorithm (the Hungarian algorithm), but I just solved it for size 2 because that's the only size we really care about.

Thanks for taking a look, I'll try and get the plain lattice code separated from the tail duplication code tomorrow.

In D28522#665583, @iteratee wrote:

Looking over this, it feels like I can split the lattice portion out and put it in first. Even together they aren't very big, so if I can get some initial opinions on this before I start that, it would be appreciated.

I thought about this, and looked at what would go into the 2 patches. I don't think it's worth it now.

I updated the comments to make it more clear that we're doing this for general lattices and tail-duplication gets the benefit as well.

davidxl added inline comments.Feb 6 2017, 11:17 AM

lib/CodeGen/MachineBlockPlacement.cpp
760 ↗	(On Diff #87043)	merge error here -- two returns
833 ↗	(On Diff #87043)	tail duplication can create this pattern - why is it skipped? ` `BB Pred \| \ /\| \| \ / \| \| /\ \| \| / S2 \| / / S1 <-+` `
836 ↗	(On Diff #87043)	merge this with the insert before?
850 ↗	(On Diff #87043)	Have a high level description of the selection algo here as comments.
1220 ↗	(On Diff #87043)	unrelated change?
1314 ↗	(On Diff #87043)	unrelated change?

Handle Lattices with inter-successor edges.

More comments.

iteratee marked 4 inline comments as done.Feb 6 2017, 4:29 PM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
836 ↗	(On Diff #87043)	It can't, we have to count those predecessors, we just don't have to check them again.

davidxl added inline comments.Feb 7 2017, 11:02 AM

lib/CodeGen/MachineBlockPlacement.cpp
836 ↗	(On Diff #87043)	Can you move insert call down after ++PreCount. If insertion returns false, continue.
817 ↗	(On Diff #87333)	Since lattice with triangle is handled, the definition of lattice here is no longer accurate -- some predecessor does not have the same successors as BB.
847 ↗	(On Diff #87333)	how does this handle triangle case? In the example, S2 is predecessor of S1, but does not have the same successors of BB.
880 ↗	(On Diff #87333)	It is better to split the BestEdges into two BestIncomingEdges vectors, one for each viable successor, and then sort them (or BestOutgoingEdges vectors, one for each predecessor). This makes the following code much more readable.
901 ↗	(On Diff #87333)	don't conflict (aka sharing the same successor)
912 ↗	(On Diff #87333)	If BestEdges are split, there is no need to do linear search for the best incoming edge for each successor -- after sorting, they are already accessible. Split BestEdges according to Predecessor is fine too. Either way, it is easy to detect conflict.
1296 ↗	(On Diff #87333)	Extract the special handling of lattice code into a helper to make the main flow of the caller cleaner.

Add back missing statement to handle triangles lost in merge.
Other tidying.

iteratee marked 4 inline comments as done.Feb 7 2017, 1:31 PM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
847 ↗	(On Diff #87333)	There was a merge mistake where it disappeared, sorry. I put it back.
880 ↗	(On Diff #87333)	Good catch, that is simpler.
1296 ↗	(On Diff #87333)	It already is in 2 helpers. Ignoring the logging the logic is: if (isLattice) return getBestLattice(); I don't see how a helper would be useful in this case.

davidxl added inline comments.Feb 7 2017, 1:48 PM

lib/CodeGen/MachineBlockPlacement.cpp
1296 ↗	(On Diff #87333)	I suggest pushing isLattice check into getBestLatticeSuccessor call. Also pushing the debug tracking code there too. The caller will be like if (LatticeSucc = getBestLatticeSuccessor(...)) { return BlockAndTailDupResult(LatticeSucc, false}' }
1292 ↗	(On Diff #87511)	Is this null? can you explain this code?

Move logging so that control flow is more obvious.

iteratee added inline comments.Feb 7 2017, 2:52 PM

lib/CodeGen/MachineBlockPlacement.cpp
1296 ↗	(On Diff #87333)	Keeping them separate makes the early return logic simpler, but I moved the debugging code.
1292 ↗	(On Diff #87511)	Yes. If BB is part of a lattice, but not an optimal edge, then we return early. We've already determined that all of BB's successors have a better fallthrough predecessor.

davidxl added inline comments.Feb 7 2017, 3:52 PM

lib/CodeGen/MachineBlockPlacement.cpp
920 ↗	(On Diff #87524)	The assert seems redundant - lattice shape check already checks number of predecessors.
924 ↗	(On Diff #87524)	Since the non-lattice based layout algorithm looks at cfg edges in forward direction (i.e. look at successor edges), it looks wrong to use best incoming edges to detect conflict. The conflict exists when the best outgoing edges from two predecessor share the same successor. Example: (skip to the end of this example to see general algorithm). BB Pred \| \ /\| \| \ / \| \| /\ \| \| / S2 \| / S1 If best outgoing edges of BB is BB->S1, while the best outgoing edges of Pred is Pred->S2, then there is no conflict. 1.1 If there is no triangle (from S2->S1), then the best successor for BB here is S1. 1.2 If there is an edge from S2 to S1 (forming triangle), if Freq(S2->S1) > Freq(BB->S1), then we prefer layout Pred->S2->S1, so BB's best successor should be null. If the best outgoing edge of BB and Pred is the same say S1, then there is a conflict. 2.1. no triangle: if Freq(BB->S1) > Freq(Pred->S1), return S1 as the best successor for BB, otherwise S2 2.2. there is an edge from S2 to S1: .... Actually if you look at all the special cases, there is a more general algorithm to get the optimal solution for all shapes: among all 4 edges (5 edges for the triangular case), find two non conflicting edges as the fallthrough edges such that their frequency sum is maximal. (two edges are conflicting if they share either source or dest BB).

If the lattice contains a triangle, we may want to tail duplicate instead. Check for that.

iteratee updated this revision to Diff 87569.Feb 7 2017, 5:26 PM

iteratee set the repository for this revision to rL LLVM.

iteratee added a reviewer: jlebar.

davidxl added inline comments.Feb 8 2017, 12:17 PM

lib/CodeGen/MachineBlockPlacement.cpp
899 ↗	(On Diff #87569)	Using tuple does not increase readability (e.g. mapping get<0> ... to actual field). It is better to just use a struct.
917 ↗	(On Diff #87569)	To greatly increase readability, please put code between line 917 and line 934 into a helper function: getBestFallEdgesInLattice(..) with comment like: // Find two non-conflicting edges with maximal total frequency in the lattice to be used as fall through.

davidxl added inline comments.Feb 8 2017, 12:24 PM

lib/CodeGen/MachineBlockPlacement.cpp
936 ↗	(On Diff #87569)	Do early return if condition is not true to reduce nesting level.
940 ↗	(On Diff #87569)	The code will be cleaner if early return is done here too and let the tail merging checking follow.
962 ↗	(On Diff #87569)	Why do you need to do chain merging here? The method is supposed to do analysis only.

iteratee updated this revision to Diff 87725.Feb 8 2017, 3:35 PM

iteratee marked 4 inline comments as done.Feb 8 2017, 3:40 PM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
940 ↗	(On Diff #87569)	Tail duplication is the early return case.
962 ↗	(On Diff #87569)	Well, I'm open to suggestions, but if we don't merge the triangle edge, it doesn't get chosen. We know it's optimal right now. Should we have a src->dest map that saves the other side of lattices and then follow it when we are trying to choose the successor for src? That would be cleaner.

Kyle asked me to look over this from an outsider's perspective and check for understandability. But after starting on it, I think this would be easier after David's comments have been addressed. I just have a few comments for now.

lib/CodeGen/MachineBlockPlacement.cpp
821 ↗	(On Diff #87725)	One thing that's confusing to me is: What exactly is "in" the lattice? We say that BB is part of the lattice if its successors form a lattice. So it sounds like a recursive definition. But then the criteria for whether BB's successors "form a lattice" is different from the criteria for whether BB is itself in a lattice. The other thing I have no intuition for here is, why do we use the mathematical word "lattice" for this shape? I'm sure there's a good reason, and understanding that might help in general.
1000 ↗	(On Diff #87725)	Can we run clang-format over this patch? git-clang-format, included in the clang sources, can run it just over your changes, so you don't have to reformat the whole file. For instance, this line appears to fit in 80 chars, so doesn't need to be wrapped. Also we usually put "&&" at the end of the line, not the beginning. And there are some lines that appear to be longer than 80 chars. If it helps, I have a script that runs git-clang-format on every arc diff so I don't have to remember to do it myself. https://github.com/jlebar/conf/blob/master/bin/arc
1308 ↗	(On Diff #87725)	Nit, we usually omit these braces, even though the if-body is multiline.

Patch actually updated to match comments from last time.

davidxl added inline comments.Feb 12 2017, 10:43 PM

lib/CodeGen/MachineBlockPlacement.cpp
962 ↗	(On Diff #87569)	ideally this should not happen. If it happens, it means the lattice based cost analysis is not consistent with current heuristic of determine better layout predecessor (which is likely the case). We should probably enhance that logic in the future?
1055 ↗	(On Diff #88033)	This comment does not seem to be correct. D can not be C's fall through. The lattice analysis has decided that D is the fall-through of BB. You won't need to check all Preds of Succ either -- only Preds that are potential layout predecessor of Succ. In this case, C can not be Succ's layout predecessor.

only git-clang-format

iteratee marked an inline comment as done.Feb 13 2017, 11:07 AM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
962 ↗	(On Diff #87569)	I don't agree. We are conservative on purpose in the normal layout. When we find that we have a lattice, the need to be conservative shrinks. I don't expect them to get the same answer. Even if we did, I think it would be wasteful to re-compute the answer.
821 ↗	(On Diff #87725)	I didn't have a good word for it. I also can't find one googling. I'm open to suggestions. Lattice matched my initial intuitions, but really it's a subgraph where all maximal linear matchings are the same size.
1055 ↗	(On Diff #88033)	I reread it. It's correct. D should be the fallthrough successor of (C+BB). You won't need to check all Preds of Succ either -- only Preds that are potential layout predecessor of Succ. In this case, C can not be Succ's layout predecessor. I'm not sure what you mean by the above. The comment already says "unplaced predecessors"

Improved a comment, removed braces.

davidxl added inline comments.Feb 13 2017, 12:06 PM

lib/CodeGen/MachineBlockPlacement.cpp
962 ↗	(On Diff #87569)	It is not ideal, but my point still holds -- it is not a good idea to embed chain transformation code inside analysis code. If you want to improve the situation so that the analysis result can be better maintained -- that is fine -- but probably as a different patch.
1055 ↗	(On Diff #88033)	For the lattice including BB, C+BB, Succ and D, only when {BestA, BestB} == { BB->D, D->Succ} will the tail duplication check be called. Does it mean D can not be C+BB's successor? What I meant is you can pass 'D' as a parameter to this call and only check if Succ can be tail dup'ed into D.

iteratee added inline comments.Feb 13 2017, 12:53 PM

lib/CodeGen/MachineBlockPlacement.cpp
962 ↗	(On Diff #87569)	Then I'll do it the way I suggested. We already did the analysis for the other side of the lattice, we should save it, and then trust it when we get there.
1055 ↗	(On Diff #88033)	Yes, but if we tail-duplicate then C+BB has D as a fallthrough. That's why we can ignore it.

Save the analysis for the other side of the trellis so that we don't recompute it later.

Rename lattice to trellis, because it better matches existing usage.

With some changes this looks good for the part I was asked to review.

lib/CodeGen/MachineBlockPlacement.cpp
832 ↗	(On Diff #88254)	a trellis
833 ↗	(On Diff #88254)	trellises
851 ↗	(On Diff #88254)	Nit, auto*, or even just write out the type? "for (auto foo : bar)" is scary because it looks like you might be copying a nontrivially-sized object.
859 ↗	(On Diff #88254)	Do you want to avoid the double map lookup on SuccPred here? This function looks hot.
895 ↗	(On Diff #88254)	It looks like you don't actually care about anything other than the first two elements of this stable_sort? Last time I checked, std::stable_sort was relatively slow, and prone to allocate heap memory. If this is hot, you may want to avoid std::stable_sort. std::nth_element will sort of do what you want, except it doesn't seem to be stable. You may do better with a custom "GetTopTwo" function. Or maybe that's a premature optimization. :)
919 ↗	(On Diff #88254)	trellises? Now I'm not sure if this is a typo or not, but I can't find "trelliss" as the plural in any dictionary I have on hand.
935 ↗	(On Diff #88254)	Nit, I'd move this down to under the comment that says "Collect the edge frequencies". Especially since the "2" in the constructor only makes sense after the if statement below.
940 ↗	(On Diff #88254)	Maybe "and a trellis of that size is basically unheard of"?
963 ↗	(On Diff #88254)	Any reason you don't want to return a SmallVector from getBestNonConflictingEdges? Or even an std::pair? Then you could say WeightedEdge BestA, BestB; std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges); which seems closer to what you mean.
982 ↗	(On Diff #88254)	Personally I'd prefer not to use "auto" here. MachineBasicBlock* isn't too much to type out, and otherwise there's no obvious, nearby anchor for the reader as to what's going on.
1334 ↗	(On Diff #88254)	Nit, capital letter
1341 ↗	(On Diff #88254)	This is unrelated to your patch, so you don't need to change it or anything, but if we always check `!BlockFilter \|\| BlockFilter->count(Foo)`, then shouldn't we just pass BlockFilter by reference and initialize it to an empty map when necessary? This would be much more ergonomic.
1348 ↗	(On Diff #88254)	Capital "i", lower-case "U", period.

This revision is now accepted and ready to land.Feb 13 2017, 3:05 PM

iteratee updated this revision to Diff 88277.Feb 13 2017, 4:49 PM

iteratee marked 7 inline comments as done.

davidxl added inline comments.Feb 14 2017, 3:19 PM

test/CodeGen/PowerPC/tail-dup-layout.ll
204 ↗	(On Diff #88277)	The term 'unavoidable' is not well defined -- why is 'then2' unavoidable?
254 ↗	(On Diff #88277)	Is this comment relevant here?
267 ↗	(On Diff #88277)	change name to trellis_test
275 ↗	(On Diff #88277)	Use non-equal branch probabilities here to make the result more obvious in different scenarios: e.g. there are conflicts in the best incoming edges, there are no conflicts, etc.
305 ↗	(On Diff #88277)	We probably also need a test for trellis+triangle shape (without taildup)
323 ↗	(On Diff #88277)	change name.

Improved test coverage in response to comments.

iteratee marked an inline comment as done.Feb 15 2017, 10:39 AM

iteratee added inline comments.

lib/CodeGen/MachineBlockPlacement.cpp
895 ↗	(On Diff #88254)	I don't expect the lists to be large in practice, so I'll just use stable_sort for now. It's simple enough to revisit if it becomes a bottleneck.
919 ↗	(On Diff #88254)	I messed up find and replace. It's fixed now.
test/CodeGen/PowerPC/tail-dup-layout.ll
254 ↗	(On Diff #88277)	Yes.
275 ↗	(On Diff #88277)	The test is now larger. It handles conflicting incoming edges, and a couple of non-conflicting edges, and a triangle.
305 ↗	(On Diff #88277)	f->ret is a triangle edge. I'll make the test bigger with non-balanced edges to cover more scenarios.

lgtm

Closed by commit rL295223: Codegen: Make chains from trellis-shaped CFGs (authored by iteratee). · Explain WhyFeb 15 2017, 12:00 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

MachineBlockPlacement.cpp

310 lines

test/

CodeGen/

AArch64/

branch-relax-cbz.ll

13 lines

combine-comparisons-by-cse.ll

2 lines

optimize-cond-branch.ll

2 lines

AMDGPU/

basic-branch.ll

5 lines

branch-relaxation.ll

3 lines

cf-loop-on-constant.ll

2 lines

convergent-inlineasm.ll

1 line

salu-to-valu.ll

2 lines

ARM/

2007-05-22-tailmerge-3.ll

8 lines

atomic-cmpxchg.ll

8 lines

fold-stack-adjust.ll

2 lines

PowerPC/

tail-dup-break-cfg.ll

14 lines

tail-dup-layout.ll

378 lines

SPARC/

sjlj.ll

14 lines

WebAssembly/

mem-intrinsics.ll

2 lines

X86/

block-placement.ll

28 lines

bypass-slow-division-32.ll

15 lines

sse1.ll

34 lines

tail-dup-merge-loop-headers.ll

4 lines

tail-dup-repeat.ll

2 lines

tail-opts.ll

7 lines

twoaddr-coalesce-3.ll

4 lines

win-alloca-expander.ll

24 lines

Diff 88586

llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp

Show First 20 Lines • Show All 288 Lines • ▼ Show 20 Lines

namespace {		namespace {
class MachineBlockPlacement : public MachineFunctionPass {		class MachineBlockPlacement : public MachineFunctionPass {
/// \brief A typedef for a block filter set.		/// \brief A typedef for a block filter set.
typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet;		typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet;

/// Pair struct containing basic block and taildup profitiability		/// Pair struct containing basic block and taildup profitiability
struct BlockAndTailDupResult {		struct BlockAndTailDupResult {
MachineBasicBlock * BB;		MachineBasicBlock *BB;
bool ShouldTailDup;		bool ShouldTailDup;
};		};

		/// Triple struct containing edge weight and the edge.
		struct WeightedEdge {
		BlockFrequency Weight;
		MachineBasicBlock *Src;
		MachineBasicBlock *Dest;
		};

/// \brief work lists of blocks that are ready to be laid out		/// \brief work lists of blocks that are ready to be laid out
SmallVector<MachineBasicBlock *, 16> BlockWorkList;		SmallVector<MachineBasicBlock *, 16> BlockWorkList;
SmallVector<MachineBasicBlock *, 16> EHPadWorkList;		SmallVector<MachineBasicBlock *, 16> EHPadWorkList;

		/// Edges that have already been computed as optimal by the trellis code.
		DenseMap<const MachineBasicBlock *, MachineBasicBlock *> ComputedTrellisEdges;

/// \brief Machine Function		/// \brief Machine Function
MachineFunction *F;		MachineFunction *F;

/// \brief A handle to the branch probability pass.		/// \brief A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;		const MachineBranchProbabilityInfo *MBPI;

/// \brief A handle to the function-wide block frequency pass.		/// \brief A handle to the function-wide block frequency pass.
std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;		std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
▲ Show 20 Lines • Show All 122 Lines • ▼ Show 20 Lines	void rotateLoop(
const BlockFilterSet &LoopBlockSet);		const BlockFilterSet &LoopBlockSet);
void rotateLoopWithProfile(		void rotateLoopWithProfile(
BlockChain &LoopChain, const MachineLoop &L,		BlockChain &LoopChain, const MachineLoop &L,
const BlockFilterSet &LoopBlockSet);		const BlockFilterSet &LoopBlockSet);
void collectMustExecuteBBs();		void collectMustExecuteBBs();
void buildCFGChains();		void buildCFGChains();
void optimizeBranches();		void optimizeBranches();
void alignBlocks();		void alignBlocks();
		/// Returns true if a block should be tail-duplicated to increase fallthrough
		/// opportunities.
bool shouldTailDuplicate(MachineBasicBlock *BB);		bool shouldTailDuplicate(MachineBasicBlock *BB);
/// Check the edge frequencies to see if tail duplication will increase		/// Check the edge frequencies to see if tail duplication will increase
/// fallthroughs.		/// fallthroughs.
bool isProfitableToTailDup(		bool isProfitableToTailDup(
const MachineBasicBlock *BB, const MachineBasicBlock *Succ,		const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
BranchProbability AdjustedSumProb,		BranchProbability AdjustedSumProb,
const BlockChain &Chain, const BlockFilterSet *BlockFilter);		const BlockChain &Chain, const BlockFilterSet *BlockFilter);
		/// Check for a trellis layout.
		bool isTrellis(const MachineBasicBlock *BB,
		const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
		const BlockChain &Chain, const BlockFilterSet *BlockFilter);
		/// Get the best successor given a trellis layout.
		BlockAndTailDupResult getBestTrellisSuccessor(
		const MachineBasicBlock *BB,
		const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
		BranchProbability AdjustedSumProb, const BlockChain &Chain,
		const BlockFilterSet *BlockFilter);
		/// Get the best pair of non-conflicting edges.
		static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
		const MachineBasicBlock *BB,
		SmallVector<SmallVector<WeightedEdge, 8>, 2> &Edges);
/// Returns true if a block can tail duplicate into all unplaced		/// Returns true if a block can tail duplicate into all unplaced
/// predecessors. Filters based on loop.		/// predecessors. Filters based on loop.
bool canTailDuplicateUnplacedPreds(		bool canTailDuplicateUnplacedPreds(
const MachineBasicBlock *BB, MachineBasicBlock *Succ,		const MachineBasicBlock *BB, MachineBasicBlock *Succ,
const BlockChain &Chain, const BlockFilterSet *BlockFilter);		const BlockChain &Chain, const BlockFilterSet *BlockFilter);

public:		public:
static char ID; // Pass identification, replacement for typeid		static char ID; // Pass identification, replacement for typeid
▲ Show 20 Lines • Show All 151 Lines • ▼ Show 20 Lines	getAdjustedProbability(BranchProbability OrigProb,
if (SuccProbN >= SuccProbD)		if (SuccProbN >= SuccProbD)
SuccProb = BranchProbability::getOne();		SuccProb = BranchProbability::getOne();
else		else
SuccProb = BranchProbability(SuccProbN, SuccProbD);		SuccProb = BranchProbability(SuccProbN, SuccProbD);

return SuccProb;		return SuccProb;
}		}

/// Check if a block should be tail duplicated.		/// Check if \p BB has exactly the successors in \p Successors.
		static bool
		hasSameSuccessors(MachineBasicBlock &BB,
		SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
		if (BB.succ_size() != Successors.size())
		return false;
		// We don't want to count self-loops
		if (Successors.count(&BB))
		return false;
		for (MachineBasicBlock *Succ : BB.successors())
		if (!Successors.count(Succ))
		return false;
		return true;
		}

		/// Check if a block should be tail duplicated to increase fallthrough
		/// opportunities.
/// \p BB Block to check.		/// \p BB Block to check.
bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {		bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
// Blocks with single successors don't create additional fallthrough		// Blocks with single successors don't create additional fallthrough
// opportunities. Don't duplicate them. TODO: When conditional exits are		// opportunities. Don't duplicate them. TODO: When conditional exits are
// analyzable, allow them to be duplicated.		// analyzable, allow them to be duplicated.
bool IsSimple = TailDup.isSimpleBB(BB);		bool IsSimple = TailDup.isSimpleBB(BB);

if (BB->succ_size() == 1)		if (BB->succ_size() == 1)
▲ Show 20 Lines • Show All 93 Lines • ▼ Show 20 Lines	bool MachineBlockPlacement::isProfitableToTailDup(
// BB BB		// BB BB
// \| \Qout \| \		// \| \Qout \| \
// P\| C \| =		// P\| C \| =
// = C' \| C		// = C' \| C
// \| /Qin \| \|		// \| /Qin \| \|
// \| / \| C' (+Succ)		// \| / \| C' (+Succ)
// Succ Succ /\|		// Succ Succ /\|
// / \ \| \/ \|		// / \ \| \/ \|
// U/ =V = /= =		// U/ =V \| == \|
// / \ \| / \\|		// / \ \| / \\|
// D E D E		// D E D E
// '=' : Branch taken for that CFG edge		// '=' : Branch taken for that CFG edge
// Cost in the first case is: P + V		// Cost in the first case is: P + V
// For this calculation, we always assume P > Qout. If Qout > P		// For this calculation, we always assume P > Qout. If Qout > P
// The result of this function will be ignored at the caller.		// The result of this function will be ignored at the caller.
// Cost in the second case is: Qout + Qin * V + P * U + P * V		// Cost in the second case is: Qout + Qin * U + P * V
// TODO(iteratee): If we lay out D after Succ, the P * U term
// goes away. This logic is coming in D28522.

if (PDom == nullptr \|\| !Succ->isSuccessor(PDom)) {		if (PDom == nullptr \|\| !Succ->isSuccessor(PDom)) {
BranchProbability UProb = BestSuccSucc;		BranchProbability UProb = BestSuccSucc;
BranchProbability VProb = AdjustedSuccSumProb - UProb;		BranchProbability VProb = AdjustedSuccSumProb - UProb;
BlockFrequency V = SuccFreq * VProb;		BlockFrequency V = SuccFreq * VProb;
BlockFrequency QinV = Qin * VProb;		BlockFrequency QinU = Qin * UProb;
BlockFrequency BaseCost = P + V;		BlockFrequency BaseCost = P + V;
BlockFrequency DupCost = Qout + QinV + P * AdjustedSuccSumProb;		BlockFrequency DupCost = Qout + QinU + P * VProb;
return greaterWithBias(BaseCost, DupCost, EntryFreq);		return greaterWithBias(BaseCost, DupCost, EntryFreq);
}		}
BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);		BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
BranchProbability VProb = AdjustedSuccSumProb - UProb;		BranchProbability VProb = AdjustedSuccSumProb - UProb;
BlockFrequency U = SuccFreq * UProb;		BlockFrequency U = SuccFreq * UProb;
BlockFrequency V = SuccFreq * VProb;		BlockFrequency V = SuccFreq * VProb;
// If there is a post-dominating successor, here is the calculation:		// If there is a post-dominating successor, here is the calculation:
// BB BB BB BB		// BB BB BB BB
// \| \Qout \| \ \| \Qout \| \		// \| \Qout \| \ \| \Qout \| \
// \|P C \| = \|P C \| =		// \|P C \| = \|P C \| =
// = C' \|P C = C' \|P C		// = C' \|P C = C' \|P C
// \| /Qin \| \| \| /Qin \| \|		// \| /Qin \| \| \| /Qin \| \|
// \| / \| C' (+Succ) \| / \| C' (+Succ)		// \| / \| C' (+Succ) \| / \| C' (+Succ)
// Succ Succ /\| Succ Succ /\|		// Succ Succ /\| Succ Succ /\|
// \| \ V \| \/ \| \| \ V \| \/ \|		// \| \ V \| \/ \| \| \ V \| \/ \|
// \|U \ \|U /\ \| \|U = \|U /\ \|		// \|U \ \|U /\ \| \|U = \|U /\ \|
// = D = = =\| \| D \| = =\|		// = D = = \= \| D \| = =\|
// \| / \|/ D \| / \|/ D		// \| / \|/ D \| / \|/ D
// \| / \| / \| = \| /		// \| / \| / \| = \| /
// \|/ \| / \|/ \| =		// \|/ \| / \|/ \| =
// Dom Dom Dom Dom		// Dom Dom Dom Dom
// '=' : Branch taken for that CFG edge		// '=' : Branch taken for that CFG edge
// The cost for taken branches in the first case is P + U		// The cost for taken branches in the first case is P + U
// The cost in the second case (assuming independence), given the layout:		// The cost in the second case (assuming independence), given the layout:
// BB, Succ, (C+Succ), D, Dom		// BB, Succ, (C+Succ), D, Dom
// is Qout + P * V + Qin * U		// is Qout + P * V + Qin * U
// compare P + U vs Qout + P + Qin * U.		// compare P + U vs Qout + P * U + Qin.
//		//
// The 3rd and 4th cases cover when Dom would be chosen to follow Succ.		// The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
//		//
// For the 3rd case, the cost is P + 2 * V		// For the 3rd case, the cost is P + 2 * V
// For the 4th case, the cost is Qout + Qin * U + P * V + V		// For the 4th case, the cost is Qout + Qin * U + P * V + V
// We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V		// We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V
if (UProb > AdjustedSuccSumProb / 2		if (UProb > AdjustedSuccSumProb / 2 &&
&& !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom],		!hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
UProb, UProb, Chain, BlockFilter)) {		Chain, BlockFilter))
// Cases 3 & 4		// Cases 3 & 4
return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),		return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),
EntryFreq);		EntryFreq);
}
// Cases 1 & 2		// Cases 1 & 2
return greaterWithBias(		return greaterWithBias(
(P + U), (Qout + Qin * UProb + P * AdjustedSuccSumProb), EntryFreq);		(P + U), (Qout + Qin * AdjustedSuccSumProb + P * UProb), EntryFreq);
		}

		/// Check for a trellis layout. \p BB is the upper part of a trellis if its
		/// successors form the lower part of a trellis. A successor set S forms the
		/// lower part of a trellis if all of the predecessors of S are either in S or
		/// have all of S as successors. We ignore trellises where BB doesn't have 2
		/// successors because for fewer than 2, it's trivial, and for 3 or greater they
		/// are very uncommon and complex to compute optimally. Allowing edges within S
		/// is not strictly a trellis, but the same algorithm works, so we allow it.
		bool MachineBlockPlacement::isTrellis(
		const MachineBasicBlock *BB,
		const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
		const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
		// Technically BB could form a trellis with branching factor higher than 2.
		// But that's extremely uncommon.
		if (BB->succ_size() != 2 \|\| ViableSuccs.size() != 2)
		return false;

		SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
		BB->succ_end());
		// To avoid reviewing the same predecessors twice.
		SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;

		for (MachineBasicBlock *Succ : ViableSuccs) {
		int PredCount = 0;
		for (auto SuccPred : Succ->predecessors()) {
		// Allow triangle successors, but don't count them.
		if (Successors.count(SuccPred))
		continue;
		const BlockChain *PredChain = BlockToChain[SuccPred];
		if (SuccPred == BB \|\| (BlockFilter && !BlockFilter->count(SuccPred)) \|\|
		PredChain == &Chain \|\| PredChain == BlockToChain[Succ])
		continue;
		++PredCount;
		// Perform the successor check only once.
		if (!SeenPreds.insert(SuccPred).second)
		continue;
		if (!hasSameSuccessors(*SuccPred, Successors))
		return false;
		}
		// If one of the successors has only BB as a predecessor, it is not a
		// trellis.
		if (PredCount < 1)
		return false;
		}
		return true;
}		}

		/// Pick the highest total weight pair of edges that can both be laid out.
		/// The edges in \p Edges[0] are assumed to have a different destination than
		/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
		/// the individual highest weight edges to the 2 different destinations, or in
		/// case of a conflict, one of them should be replaced with a 2nd best edge.
		std::pair<MachineBlockPlacement::WeightedEdge,
		MachineBlockPlacement::WeightedEdge>
		MachineBlockPlacement::getBestNonConflictingEdges(
		const MachineBasicBlock *BB,
		SmallVector<SmallVector<MachineBlockPlacement::WeightedEdge, 8>, 2>
		&Edges) {
		// Sort the edges, and then for each successor, find the best incoming
		// predecessor. If the best incoming predecessors aren't the same,
		// then that is clearly the best layout. If there is a conflict, one of the
		// successors will have to fallthrough from the second best predecessor. We
		// compare which combination is better overall.

		// Sort for highest frequency.
		auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };

		std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
		std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
		auto BestA = Edges[0].begin();
		auto BestB = Edges[1].begin();
		// Arrange for the correct answer to be in BestA and BestB
		// If the 2 best edges don't conflict, the answer is already there.
		if (BestA->Src == BestB->Src) {
		// Compare the total fallthrough of (Best + Second Best) for both pairs
		auto SecondBestA = std::next(BestA);
		auto SecondBestB = std::next(BestB);
		BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
		BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
		if (BestAScore < BestBScore)
		BestA = SecondBestA;
		else
		BestB = SecondBestB;
		}
		// Arrange for the BB edge to be in BestA if it exists.
		if (BestB->Src == BB)
		std::swap(BestA, BestB);
		return std::make_pair(BestA, BestB);
		}

		/// Get the best successor from \p BB based on \p BB being part of a trellis.
		/// We only handle trellises with 2 successors, so the algorithm is
		/// straightforward: Find the best pair of edges that don't conflict. We find
		/// the best incoming edge for each successor in the trellis. If those conflict,
		/// we consider which of them should be replaced with the second best.
		/// Upon return the two best edges will be in \p BestEdges. If one of the edges
		/// comes from \p BB, it will be in \p BestEdges[0]
		MachineBlockPlacement::BlockAndTailDupResult
		MachineBlockPlacement::getBestTrellisSuccessor(
		const MachineBasicBlock *BB,
		const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
		BranchProbability AdjustedSumProb, const BlockChain &Chain,
		const BlockFilterSet *BlockFilter) {

		BlockAndTailDupResult Result = {nullptr, false};
		SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
		BB->succ_end());

		// We assume size 2 because it's common. For general n, we would have to do
		// the Hungarian algorithm, but it's not worth the complexity because more
		// than 2 successors is fairly uncommon, and a trellis even more so.
		if (Successors.size() != 2 \|\| ViableSuccs.size() != 2)
		return Result;

		// Collect the edge frequencies of all edges that form the trellis.
		SmallVector<SmallVector<WeightedEdge, 8>, 2> Edges(2);
		int SuccIndex = 0;
		for (auto Succ : ViableSuccs) {
		for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
		// Skip any placed predecessors that are not BB
		if (SuccPred != BB)
		if ((BlockFilter && !BlockFilter->count(SuccPred)) \|\|
		BlockToChain[SuccPred] == &Chain \|\|
		BlockToChain[SuccPred] == BlockToChain[Succ])
		continue;
		BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
		MBPI->getEdgeProbability(SuccPred, Succ);
		Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
		}
		++SuccIndex;
		}

		// Pick the best combination of 2 edges from all the edges in the trellis.
		WeightedEdge BestA, BestB;
		std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);

		if (BestA.Src != BB) {
		// If we have a trellis, and BB doesn't have the best fallthrough edges,
		// we shouldn't choose any successor. We've already looked and there's a
		// better fallthrough edge for all the successors.
		DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
		return Result;
		}

		// Did we pick the triangle edge? If tail-duplication is profitable, do
		// that instead. Otherwise merge the triangle edge now while we know it is
		// optimal.
		if (BestA.Dest == BestB.Src) {
		// The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
		// would be better.
		MachineBasicBlock *Succ1 = BestA.Dest;
		MachineBasicBlock *Succ2 = BestB.Dest;
		// Check to see if tail-duplication would be profitable.
		if (TailDupPlacement && shouldTailDuplicate(Succ2) &&
		canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
		isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
		Chain, BlockFilter)) {
		DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
		MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
		dbgs() << " Selected: " << getBlockName(Succ2)
		<< ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
		Result.BB = Succ2;
		Result.ShouldTailDup = true;
		return Result;
		}
		}
		// We have already computed the optimal edge for the other side of the
		// trellis.
		ComputedTrellisEdges[BestB.Src] = BestB.Dest;

		auto TrellisSucc = BestA.Dest;
		DEBUG(BranchProbability SuccProb = getAdjustedProbability(
		MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
		dbgs() << " Selected: " << getBlockName(TrellisSucc)
		<< ", probability: " << SuccProb << " (Trellis)\n");
		Result.BB = TrellisSucc;
		return Result;
		}

/// When the option TailDupPlacement is on, this method checks if the		/// When the option TailDupPlacement is on, this method checks if the
/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated		/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
/// into all of its unplaced, unfiltered predecessors, that are not BB.		/// into all of its unplaced, unfiltered predecessors, that are not BB.
bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(		bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
const MachineBasicBlock BB, MachineBasicBlock Succ,		const MachineBasicBlock BB, MachineBasicBlock Succ,
const BlockChain &Chain, const BlockFilterSet *BlockFilter) {		const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
if (!shouldTailDuplicate(Succ))		if (!shouldTailDuplicate(Succ))
return false;		return false;

		// For CFG checking.
		SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
		BB->succ_end());
for (MachineBasicBlock *Pred : Succ->predecessors()) {		for (MachineBasicBlock *Pred : Succ->predecessors()) {
// Make sure all unplaced and unfiltered predecessors can be		// Make sure all unplaced and unfiltered predecessors can be
// tail-duplicated into.		// tail-duplicated into.
// Skip any blocks that are already placed or not in this loop.		// Skip any blocks that are already placed or not in this loop.
if (Pred == BB \|\| (BlockFilter && !BlockFilter->count(Pred))		if (Pred == BB \|\| (BlockFilter && !BlockFilter->count(Pred))
\|\| BlockToChain[Pred] == &Chain)		\|\| BlockToChain[Pred] == &Chain)
continue;		continue;
if (!TailDup.canTailDuplicate(Succ, Pred))		if (!TailDup.canTailDuplicate(Succ, Pred)) {
		if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
		// This will result in a trellis after tail duplication, so we don't
		// need to copy Succ into this predecessor. In the presence
		// of a trellis tail duplication can continue to be profitable.
		// For example:
		// A A
		// \|\ \|\
		// \| \ \| \
		// \| C \| C+BB
		// \| / \| \|
		// \|/ \| \|
		// BB => BB \|
		// \|\ \|\/\|
		// \| \ \|/\\|
		// \| D \| D
		// \| / \| /
		// \|/ \|/
		// Succ Succ
		//
		// After BB was duplicated into C, the layout looks like the one on the
		// right. BB and C now have the same successors. When considering
		// whether Succ can be duplicated into all its unplaced predecessors, we
		// ignore C.
		// We can do this because C already has a profitable fallthrough, namely
		// D. TODO(iteratee): ignore sufficiently cold predecessors for
		// duplication and for this test.
		//
		// This allows trellises to be laid out in 2 separate chains
		// (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
		// because it allows the creation of 2 fallthrough paths with links
		// between them, and we correctly identify the best layout for these
		// CFGs. We want to extend trellises that the user created in addition
		// to trellises created by tail-duplication, so we just look for the
		// CFG.
		continue;
return false;		return false;
}		}
		}
return true;		return true;
}		}

/// When the option OutlineOptionalBranches is on, this method		/// When the option OutlineOptionalBranches is on, this method
/// checks if the fallthrough candidate block \p Succ (of block		/// checks if the fallthrough candidate block \p Succ (of block
/// \p BB) also has other unscheduled predecessor blocks which		/// \p BB) also has other unscheduled predecessor blocks which
/// are also successors of \p BB (forming triangular shape CFG).		/// are also successors of \p BB (forming triangular shape CFG).
/// If none of such predecessors are small, it returns true.		/// If none of such predecessors are small, it returns true.
▲ Show 20 Lines • Show All 167 Lines • ▼ Show 20 Lines	bool MachineBlockPlacement::hasBetterLayoutPredecessor(
//		//
// S-------\| ---S		// S-------\| ---S
// \| \| \| \|		// \| \| \| \|
// ---BB \| \| BB		// ---BB \| \| BB
// \| \| \| \|		// \| \| \| \|
// \| Pred----\| \| S1----		// \| Pred----\| \| S1----
// \| \| \| \|		// \| \| \| \|
// --(S1 or S2) ---Pred--		// --(S1 or S2) ---Pred--
		// \|
		// S2
//		//
// topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)		// topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
// + min(freq(Pred->S1), freq(Pred->S2))		// + min(freq(Pred->S1), freq(Pred->S2))
// Non-topo-order cost:		// Non-topo-order cost:
// In the worst case, S2 will not get laid out after Pred.
// non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).		// non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))		// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
// is 0. Then the non topo layout is better when		// is 0. Then the non topo layout is better when
// freq(S->Pred) < freq(BB->S1).		// freq(S->Pred) < freq(BB->S1).
// This is exactly what is checked below.		// This is exactly what is checked below.
// Note there are other shapes that apply (Pred may not be a single block,		// Note there are other shapes that apply (Pred may not be a single block,
// but they all fit this general pattern.)		// but they all fit this general pattern.)
BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);		BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	MachineBlockPlacement::selectBestSuccessor(
auto BestProb = BranchProbability::getZero();		auto BestProb = BranchProbability::getZero();

SmallVector<MachineBasicBlock *, 4> Successors;		SmallVector<MachineBasicBlock *, 4> Successors;
auto AdjustedSumProb =		auto AdjustedSumProb =
collectViableSuccessors(BB, Chain, BlockFilter, Successors);		collectViableSuccessors(BB, Chain, BlockFilter, Successors);

DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");		DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");

		// If we already precomputed the best successor for BB as part of a trellis we
		// saw earlier, return that if still applicable.
		auto FoundEdge = ComputedTrellisEdges.find(BB);
		if (FoundEdge != ComputedTrellisEdges.end()) {
		MachineBasicBlock *Succ = FoundEdge->second;
		ComputedTrellisEdges.erase(FoundEdge);
		BlockChain *SuccChain = BlockToChain[Succ];
		if (BB->isSuccessor(Succ) && (!BlockFilter \|\| BlockFilter->count(Succ)) &&
		SuccChain != &Chain && Succ == *SuccChain->begin()) {
		BestSucc.BB = Succ;
		return BestSucc;
		}
		}

		// If BB is part of a trellis, use the trellis to determine the optimal
		// fallthrough edges.
		if (isTrellis(BB, Successors, Chain, BlockFilter))
		return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
		BlockFilter);

// For blocks with CFG violations, we may be able to lay them out anyway with		// For blocks with CFG violations, we may be able to lay them out anyway with
// tail-duplication. We keep this vector so we can perform the probability		// tail-duplication. We keep this vector so we can perform the probability
// calculations the minimum number of times.		// calculations the minimum number of times.
SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>		SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>
DupCandidates;		DupCandidates;
for (MachineBasicBlock *Succ : Successors) {		for (MachineBasicBlock *Succ : Successors) {
auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);		auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
BranchProbability SuccProb =		BranchProbability SuccProb =
▲ Show 20 Lines • Show All 1,377 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AArch64/branch-relax-cbz.ll

	; RUN: llc -mtriple=aarch64-apple-darwin -aarch64-cbz-offset-bits=3 < %s \| FileCheck %s			; RUN: llc -mtriple=aarch64-apple-darwin -aarch64-cbz-offset-bits=3 < %s \| FileCheck %s

	; CHECK-LABEL: _split_block_no_fallthrough:			; CHECK-LABEL: _split_block_no_fallthrough:
	; CHECK: cmn x{{[0-9]+}}, #5			; CHECK: cmn x{{[0-9]+}}, #5
	; CHECK-NEXT: b.le [[B2:LBB[0-9]+_[0-9]+]]			; CHECK-NEXT: b.le [[B2:LBB[0-9]+_[0-9]+]]

	; CHECK-NEXT: ; BB#1: ; %b3			; CHECK-NEXT: ; BB#1: ; %b3
	; CHECK: ldr [[LOAD:w[0-9]+]]			; CHECK: ldr [[LOAD:w[0-9]+]]
	; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]			; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
	; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]

	; CHECK-NEXT: [[SKIP_LONG_B]]:
	; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]			; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]

				; CHECK-NEXT: [[B8]]: ; %b8
				; CHECK-NEXT: ret

	; CHECK-NEXT: [[B2]]: ; %b2			; CHECK-NEXT: [[B2]]: ; %b2
	; CHECK: mov w{{[0-9]+}}, #93			; CHECK: mov w{{[0-9]+}}, #93
	; CHECK: bl _extfunc			; CHECK: bl _extfunc
	; CHECK: cbz w{{[0-9]+}}, [[B7]]			; CHECK: cbz w{{[0-9]+}}, [[B7]]
				; CHECK-NEXT: b [[B8]]
	; CHECK-NEXT: [[B8]]: ; %b8
	; CHECK-NEXT: ret

	; CHECK-NEXT: [[B7]]: ; %b7			; CHECK-NEXT: [[B7]]: ; %b7
	; CHECK: mov w{{[0-9]+}}, #13			; CHECK: mov w{{[0-9]+}}, #13
	; CHECK: b _extfunc			; CHECK: b _extfunc

	define void @split_block_no_fallthrough(i64 %val) #0 {			define void @split_block_no_fallthrough(i64 %val) #0 {
	bb:			bb:
	%c0 = icmp sgt i64 %val, -5			%c0 = icmp sgt i64 %val, -5
	br i1 %c0, label %b3, label %b2			br i1 %c0, label %b3, label %b2

	b2:			b2:
	%v0 = tail call i32 @extfunc(i32 93)			%v0 = tail call i32 @extfunc(i32 93)
	%c1 = icmp eq i32 %v0, 0			%c1 = icmp eq i32 %v0, 0
	Show All 18 Lines

llvm/trunk/test/CodeGen/AArch64/combine-comparisons-by-cse.ll

	Show First 20 Lines • Show All 258 Lines • ▼ Show 20 Lines
	}			}

	; undefined external to prevent possible optimizations			; undefined external to prevent possible optimizations
	declare void @do_something() #1			declare void @do_something() #1

	define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {			define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
	; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ			; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
	; CHECK: cmn			; CHECK: cmn
	; CHECK: b.gt			; CHECK: b.le
	; CHECK: cmp			; CHECK: cmp
	; CHECK: b.gt			; CHECK: b.gt
	entry:			entry:
	%0 = load i32, i32* @a, align 4			%0 = load i32, i32* @a, align 4
	%cmp4 = icmp slt i32 %0, -1			%cmp4 = icmp slt i32 %0, -1
	br i1 %cmp4, label %while.body.preheader, label %while.end			br i1 %cmp4, label %while.body.preheader, label %while.end

	while.body.preheader: ; preds = %entry			while.body.preheader: ; preds = %entry
	▲ Show 20 Lines • Show All 199 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AArch64/optimize-cond-branch.ll

	; RUN: llc -verify-machineinstrs -o - %s \| FileCheck %s			; RUN: llc -verify-machineinstrs -o - %s \| FileCheck %s
	target triple = "arm64--"			target triple = "arm64--"

	; AArch64InstrInfo::optimizeCondBranch() optimizes the			; AArch64InstrInfo::optimizeCondBranch() optimizes the
	; "x = and y, 256; cmp x, 0; br" from an "and; cbnz" to a tbnz instruction.			; "x = and y, 256; cmp x, 0; br" from an "and; cbnz" to a tbnz instruction.
	; It forgot to clear a flag resulting in a MachineVerifier complaint.			; It forgot to clear a flag resulting in a MachineVerifier complaint.
	;			;
	; Writing a stable/simple test is tricky since most tbz instructions are already			; Writing a stable/simple test is tricky since most tbz instructions are already
	; formed in SelectionDAG, optimizeCondBranch() only triggers if the and			; formed in SelectionDAG, optimizeCondBranch() only triggers if the and
	; instruction is in a different block than the conditional jump.			; instruction is in a different block than the conditional jump.
	;			;
	; CHECK-LABEL: func			; CHECK-LABEL: func
	; CHECK-NOT: and			; CHECK-NOT: and
	; CHECK: tbnz			; CHECK: tbz
	define void @func() {			define void @func() {
	%c0 = icmp sgt i64 0, 0			%c0 = icmp sgt i64 0, 0
	br i1 %c0, label %b1, label %b6			br i1 %c0, label %b1, label %b6

	b1:			b1:
	br i1 undef, label %b3, label %b2			br i1 undef, label %b3, label %b2

	b2:			b2:
	Show All 26 Lines

llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll

	; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s			; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
	; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s			; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s

	; GCN-LABEL: {{^}}test_branch:			; GCN-LABEL: {{^}}test_branch:
	; GCNNOOPT: v_writelane_b32			; GCNNOOPT: v_writelane_b32
	; GCNNOOPT: v_writelane_b32			; GCNNOOPT: v_writelane_b32
	; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]			; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]


	; GCN: ; BB#1
	; GCNNOOPT: v_readlane_b32			; GCNNOOPT: v_readlane_b32
	; GCNNOOPT: v_readlane_b32			; GCNNOOPT: v_readlane_b32
	; GCN: buffer_store_dword			; GCN: buffer_store_dword
	; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)			; GCNNOOPT: s_endpgm
	; TODO: This waitcnt can be eliminated

	; GCN: {{^}}[[END]]:			; GCN: {{^}}[[END]]:
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {			define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
	%cmp = icmp ne i32 %val, 0			%cmp = icmp ne i32 %val, 0
	br i1 %cmp, label %store, label %end			br i1 %cmp, label %store, label %end

	store:			store:
	Show All 32 Lines

llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll

	Show First 20 Lines • Show All 485 Lines • ▼ Show 20 Lines

	ret:			ret:
	store volatile i32 7, i32 addrspace(1)* undef			store volatile i32 7, i32 addrspace(1)* undef
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}long_branch_hang:			; GCN-LABEL: {{^}}long_branch_hang:
	; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6			; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
	; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]			; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
				; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
	; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:			; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:

	; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(			; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
	; GCN: s_setpc_b64			; GCN: s_setpc_b64

	; GCN-NEXT: [[LONG_BR_0]]:			; GCN-NEXT: [[LONG_BR_0]]:
	; GCN-DAG: v_cmp_lt_i32			; GCN-DAG: v_cmp_lt_i32
	; GCN-DAG: v_cmp_gt_i32			; GCN-DAG: v_cmp_gt_i32
	▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s			; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s

	; GCN-LABEL: {{^}}test_loop:			; GCN-LABEL: {{^}}test_loop:
	; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:			; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}
	; GCN: ds_read_b32			; GCN: ds_read_b32
	; GCN: ds_write_b32			; GCN: ds_write_b32
	; GCN: s_branch [[LABEL]]			; GCN: s_branch [[LABEL]]
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {			define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
	entry:			entry:
	%cmp = icmp eq i32 %n, -1			%cmp = icmp eq i32 %n, -1
	br i1 %cmp, label %for.exit, label %for.body			br i1 %cmp, label %for.exit, label %for.body
	▲ Show 20 Lines • Show All 110 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/convergent-inlineasm.ll

	Show All 23 Lines

	; GCN-LABEL: {{^}}nonconvergent_inlineasm:			; GCN-LABEL: {{^}}nonconvergent_inlineasm:
	; GCN: ; mask branch			; GCN: ; mask branch

	; GCN: BB{{[0-9]+_[0-9]+}}:			; GCN: BB{{[0-9]+_[0-9]+}}:
	; GCN: v_cmp_ne_u32_e64			; GCN: v_cmp_ne_u32_e64

	; GCN: BB{{[0-9]+_[0-9]+}}:			; GCN: BB{{[0-9]+_[0-9]+}}:

	define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {			define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
	bb:			bb:
	%tmp = call i32 @llvm.amdgcn.workitem.id.x()			%tmp = call i32 @llvm.amdgcn.workitem.id.x()
	%tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)			%tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)
	%tmp2 = icmp eq i32 %tmp, 8			%tmp2 = icmp eq i32 %tmp, 8
	br i1 %tmp2, label %bb3, label %bb5			br i1 %tmp2, label %bb3, label %bb5

	bb3: ; preds = %bb			bb3: ; preds = %bb
	Show All 10 Lines

llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll

	Show First 20 Lines • Show All 433 Lines • ▼ Show 20 Lines
	; {{^}}sopc_vopc_legalize_bug:			; {{^}}sopc_vopc_legalize_bug:
	; GCN: s_load_dword [[SGPR:s[0-9]+]]			; GCN: s_load_dword [[SGPR:s[0-9]+]]
	; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}			; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
	; GCN: s_and_b64 vcc, exec, vcc			; GCN: s_and_b64 vcc, exec, vcc
	; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]			; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
	; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1			; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
	; GCN-NOHSA: buffer_store_dword [[ONE]]			; GCN-NOHSA: buffer_store_dword [[ONE]]
	; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]			; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
	; GCN; {{^}}[[EXIT]]:			; GCN: {{^}}[[EXIT]]:
	; GCN: s_endpgm			; GCN: s_endpgm
	define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {			define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
	bb3: ; preds = %bb2			bb3: ; preds = %bb2
	%tmp0 = bitcast i32 %cond to float			%tmp0 = bitcast i32 %cond to float
	%tmp1 = fadd float %tmp0, 2.500000e-01			%tmp1 = fadd float %tmp0, 2.500000e-01
	%tmp2 = bitcast float %tmp1 to i32			%tmp2 = bitcast float %tmp1 to i32
	%tmp3 = icmp ult i32 %tmp2, %cond			%tmp3 = icmp ult i32 %tmp2, %cond
	br i1 %tmp3, label %bb6, label %bb7			br i1 %tmp3, label %bb6, label %bb7
	▲ Show 20 Lines • Show All 57 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll

	; RUN: llc < %s -march=arm | FileCheck %s			; RUN: llc < %s -march=arm | FileCheck %s
	; RUN: llc < %s -march=arm -enable-tail-merge=0 | \			; RUN: llc < %s -march=arm -enable-tail-merge=0 | \
	; RUN: FileCheck --check-prefix=NOMERGE %s			; RUN: FileCheck --check-prefix=NOMERGE %s

	; Check that tail merging is the default on ARM, and that -enable-tail-merge=0			; Check that tail merging is the default on ARM, and that -enable-tail-merge=0
	; works.			; works.
	; PR1628			; PR1628

	; CHECK: bl _baz			; CHECK: bl _baz
	; CHECK-NOT: bl _baz			; CHECK-NOT: bl _baz

	; CHECK: bl _quux			; CHECK: bl _quux
	; CHECK-NOT: bl _quux			; CHECK-NOT: bl _quux

	; NOMERGE: bl _baz			; NOMERGE-DAG: bl _baz
	; NOMERGE: bl _baz			; NOMERGE-DAG: bl _baz

	; NOMERGE: bl _quux			; NOMERGE-DAG: bl _quux
	; NOMERGE: bl _quux			; NOMERGE-DAG: bl _quux

	; ModuleID = 'tail.c'			; ModuleID = 'tail.c'
	target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"			target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
	target triple = "i686-apple-darwin8"			target triple = "i686-apple-darwin8"

	define i32 @f(i32 %i, i32 %q) {			define i32 @f(i32 %i, i32 %q) {
	entry:			entry:
	%i_addr = alloca i32 ; <i32*> [#uses=2]			%i_addr = alloca i32 ; <i32*> [#uses=2]
	▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/atomic-cmpxchg.ll

	Show First 20 Lines • Show All 60 Lines • ▼ Show 20 Lines

	; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8:			; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8:
	; CHECK-ARMV7-NEXT: .fnstart			; CHECK-ARMV7-NEXT: .fnstart
	; CHECK-ARMV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1			; CHECK-ARMV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
	; CHECK-ARMV7-NEXT: b [[TRY:.LBB[0-9_]+]]			; CHECK-ARMV7-NEXT: b [[TRY:.LBB[0-9_]+]]
	; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:			; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
	; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]			; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
	; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0			; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
	; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1			; CHECK-ARMV7-NEXT: moveq r0, #1
	; CHECK-ARMV7-NEXT: bxeq lr			; CHECK-ARMV7-NEXT: bxeq lr
	; CHECK-ARMV7-NEXT: [[TRY]]:			; CHECK-ARMV7-NEXT: [[TRY]]:
	; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]			; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
	; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]			; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
	; CHECK-ARMV7-NEXT: beq [[HEAD]]			; CHECK-ARMV7-NEXT: beq [[HEAD]]
	; CHECK-ARMV7-NEXT: clrex			; CHECK-ARMV7-NEXT: clrex
	; CHECK-ARMV7-NEXT: mov [[RES]], #0			; CHECK-ARMV7-NEXT: mov r0, #0
	; CHECK-ARMV7-NEXT: bx lr			; CHECK-ARMV7-NEXT: bx lr

	; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:			; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
	; CHECK-THUMBV7-NEXT: .fnstart			; CHECK-THUMBV7-NEXT: .fnstart
	; CHECK-THUMBV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1			; CHECK-THUMBV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
	; CHECK-THUMBV7-NEXT: b [[TRYLD:.LBB[0-9_]+]]			; CHECK-THUMBV7-NEXT: b [[TRYLD:.LBB[0-9_]+]]
	; CHECK-THUMBV7-NEXT: [[TRYST:.LBB[0-9_]+]]:			; CHECK-THUMBV7-NEXT: [[TRYST:.LBB[0-9_]+]]:
	; CHECK-THUMBV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]			; CHECK-THUMBV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
	Show All 11 Lines

llvm/trunk/test/CodeGen/ARM/fold-stack-adjust.ll

	Show First 20 Lines • Show All 129 Lines • ▼ Show 20 Lines

	; PR18136: there was a bug determining where the first eligible pop in a			; PR18136: there was a bug determining where the first eligible pop in a
	; basic-block was when the entire block was epilogue code.			; basic-block was when the entire block was epilogue code.
	define void @test_fold_point(i1 %tst) minsize {			define void @test_fold_point(i1 %tst) minsize {
	; CHECK-LABEL: test_fold_point:			; CHECK-LABEL: test_fold_point:

	; Important to check for beginning of basic block, because if it gets			; Important to check for beginning of basic block, because if it gets
	; if-converted the test is probably no longer checking what it should.			; if-converted the test is probably no longer checking what it should.
	; CHECK: {{LBB[0-9]+_2}}:			; CHECK: %end
	; CHECK-NEXT: vpop {d7, d8}			; CHECK-NEXT: vpop {d7, d8}
	; CHECK-NEXT: pop {r4, pc}			; CHECK-NEXT: pop {r4, pc}

	; With a guaranteed frame-pointer, we want to make sure that its offset in the			; With a guaranteed frame-pointer, we want to make sure that its offset in the
	; push block is correct, even if a few registers have been tacked onto a later			; push block is correct, even if a few registers have been tacked onto a later
	; vpush (PR18160).			; vpush (PR18160).
	; CHECK-IOS-LABEL: test_fold_point:			; CHECK-IOS-LABEL: test_fold_point:
	; CHECK-IOS: push {r4, r7, lr}			; CHECK-IOS: push {r4, r7, lr}
	▲ Show 20 Lines • Show All 89 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/PowerPC/tail-dup-break-cfg.ll

Show All 10 Lines
; exit		; exit

;CHECK-LABEL: tail_dup_break_cfg:		;CHECK-LABEL: tail_dup_break_cfg:
;CHECK: mr [[TAGREG:[0-9]+]], 3		;CHECK: mr [[TAGREG:[0-9]+]], 3
;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1		;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]		;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]
;CHECK-NEXT: # %test2		;CHECK-NEXT: # %test2
;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30		;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]		;CHECK-NEXT: bne 0, [[BODY2LABEL:[._0-9A-Za-z]+]]
;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]		;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
		;CHECK: blr
;CHECK-NEXT: [[BODY1LABEL]]		;CHECK-NEXT: [[BODY1LABEL]]
;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30		;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
;CHECK-NEXT: beq 0, [[EXITLABEL]]		;CHECK-NEXT: beq 0, [[EXITLABEL]]
;CHECK-NEXT: [[BODY2LABEL]]		;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]:
;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit		;CHECK: b [[EXITLABEL]]
;CHECK: blr
define void @tail_dup_break_cfg(i32 %tag) {		define void @tail_dup_break_cfg(i32 %tag) {
entry:		entry:
br label %test1		br label %test1
test1:		test1:
%tagbit1 = and i32 %tag, 1		%tagbit1 = and i32 %tag, 1
%tagbit1eq0 = icmp eq i32 %tagbit1, 0		%tagbit1eq0 = icmp eq i32 %tagbit1, 0
br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely		br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely
body1:		body1:
Show All 39 Lines	body1:
call void @a()		call void @a()
call void @a()		call void @a()
call void @a()		call void @a()
call void @a()		call void @a()
br label %test2		br label %test2
test2:		test2:
%tagbit2 = and i32 %tag, 2		%tagbit2 = and i32 %tag, 2
%tagbit2eq0 = icmp ne i32 %tagbit2, 0		%tagbit2eq0 = icmp ne i32 %tagbit2, 0
br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely		br i1 %tagbit2eq0, label %body2, label %exit, !prof !3 ; %body2 more likely
body2:		body2:
call void @b()		call void @b()
call void @b()		call void @b()
call void @b()		call void @b()
call void @b()		call void @b()
br label %exit		br label %exit
exit:		exit:
ret void		ret void
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	v:
br label %ret		br label %ret
ret:		ret:
ret void		ret void
}		}


!1 = !{!"branch_weights", i32 5, i32 3}		!1 = !{!"branch_weights", i32 5, i32 3}
!2 = !{!"branch_weights", i32 95, i32 5}		!2 = !{!"branch_weights", i32 95, i32 5}
!3 = !{!"branch_weights", i32 7, i32 3}		!3 = !{!"branch_weights", i32 8, i32 3}

llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll

	; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s			; RUN: llc -O2 < %s | FileCheck %s
	target datalayout = "e-m:e-i64:64-n32:64"			target datalayout = "e-m:e-i64:64-n32:64"
	target triple = "powerpc64le-grtev4-linux-gnu"			target triple = "powerpc64le-grtev4-linux-gnu"

	; Intended layout:			; Intended layout:
	; The outlining flag produces the layout			; The chain-based outlining produces the layout
	; test1			; test1
	; test2			; test2
	; test3			; test3
	; test4			; test4
	; exit
	; optional1			; optional1
	; optional2			; optional2
	; optional3			; optional3
	; optional4			; optional4
				; exit
	; Tail duplication puts test n+1 at the end of optional n			; Tail duplication puts test n+1 at the end of optional n
	; so optional1 includes a copy of test2 at the end, and branches			; so optional1 includes a copy of test2 at the end, and branches
	; to test3 (at the top) or falls through to optional 2.			; to test3 (at the top) or falls through to optional 2.
	; The CHECK statements check for the whole string of tests and exit block,			; The CHECK statements check for the whole string of tests
	; and then check that the correct test has been duplicated into the end of			; and then check that the correct test has been duplicated into the end of
	; the optional blocks and that the optional blocks are in the correct order.			; the optional blocks and that the optional blocks are in the correct order.
	;CHECK-LABEL: f:			;CHECK-LABEL: straight_test:
	; test1 may have been merged with entry			; test1 may have been merged with entry
	;CHECK: mr [[TAGREG:[0-9]+]], 3			;CHECK: mr [[TAGREG:[0-9]+]], 3
	;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1			;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
	;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]			;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
	;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2			;CHECK-NEXT: # %test2
	;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30			;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
	;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]			;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
	;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3			;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
	;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29			;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
	;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]			;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
	;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4			;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4
	;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28			;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
	;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]			;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]
	;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit			;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
	;CHECK: blr			;CHECK: blr
	;CHECK-NEXT: [[OPT1LABEL]]			;CHECK-NEXT: .[[OPT1LABEL]]:
	;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30			;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
	;CHECK-NEXT: beq 0, [[TEST3LABEL]]			;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
	;CHECK-NEXT: [[OPT2LABEL]]			;CHECK-NEXT: .[[OPT2LABEL]]:
	;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29			;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
	;CHECK-NEXT: beq 0, [[TEST4LABEL]]			;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
	;CHECK-NEXT: [[OPT3LABEL]]			;CHECK-NEXT: .[[OPT3LABEL]]:
	;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28			;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
	;CHECK-NEXT: beq 0, [[EXITLABEL]]			;CHECK-NEXT: beq 0, .[[EXITLABEL]]
	;CHECK-NEXT: [[OPT4LABEL]]			;CHECK-NEXT: .[[OPT4LABEL]]:
	;CHECK: b [[EXITLABEL]]			;CHECK: b .[[EXITLABEL]]

	define void @f(i32 %tag) {			define void @straight_test(i32 %tag) {
	entry:			entry:
	br label %test1			br label %test1
	test1:			test1:
	%tagbit1 = and i32 %tag, 1			%tagbit1 = and i32 %tag, 1
	%tagbit1eq0 = icmp eq i32 %tagbit1, 0			%tagbit1eq0 = icmp eq i32 %tagbit1, 0
	br i1 %tagbit1eq0, label %test2, label %optional1			br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
	optional1:			optional1:
	call void @a()			call void @a()
	call void @a()			call void @a()
	call void @a()			call void @a()
	call void @a()			call void @a()
	br label %test2			br label %test2
	test2:			test2:
	%tagbit2 = and i32 %tag, 2			%tagbit2 = and i32 %tag, 2
	%tagbit2eq0 = icmp eq i32 %tagbit2, 0			%tagbit2eq0 = icmp eq i32 %tagbit2, 0
	br i1 %tagbit2eq0, label %test3, label %optional2			br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
	optional2:			optional2:
	call void @b()			call void @b()
	call void @b()			call void @b()
	call void @b()			call void @b()
	call void @b()			call void @b()
	br label %test3			br label %test3
	test3:			test3:
	%tagbit3 = and i32 %tag, 4			%tagbit3 = and i32 %tag, 4
	%tagbit3eq0 = icmp eq i32 %tagbit3, 0			%tagbit3eq0 = icmp eq i32 %tagbit3, 0
	br i1 %tagbit3eq0, label %test4, label %optional3			br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
	optional3:			optional3:
	call void @c()			call void @c()
	call void @c()			call void @c()
	call void @c()			call void @c()
	call void @c()			call void @c()
	br label %test4			br label %test4
	test4:			test4:
	%tagbit4 = and i32 %tag, 8			%tagbit4 = and i32 %tag, 8
	%tagbit4eq0 = icmp eq i32 %tagbit4, 0			%tagbit4eq0 = icmp eq i32 %tagbit4, 0
	br i1 %tagbit4eq0, label %exit, label %optional4			br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1
	optional4:			optional4:
	call void @d()			call void @d()
	call void @d()			call void @d()
	call void @d()			call void @d()
	call void @d()			call void @d()
	br label %exit			br label %exit
	exit:			exit:
	ret void			ret void
	}			}

				; Intended layout:
				; The chain-based outlining produces the layout
				; entry
				; --- Begin loop ---
				; for.latch
				; for.check
				; test1
				; test2
				; test3
				; test4
				; optional1
				; optional2
				; optional3
				; optional4
				; --- End loop ---
				; exit
				; The CHECK statements check for the whole string of tests and exit block,
				; and then check that the correct test has been duplicated into the end of
				; the optional blocks and that the optional blocks are in the correct order.
				;CHECK-LABEL: loop_test:
				;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4
				;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch
				;CHECK: addi
				;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
				;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
				;CHECK: # %test1
				;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
				;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
				;CHECK-NEXT: # %test2
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
				;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
				;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
				;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
				;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}}
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
				;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
				;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
				;CHECK: [[OPT1LABEL]]
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
				;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
				;CHECK-NEXT: .[[OPT2LABEL]]
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
				;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
				;CHECK-NEXT: .[[OPT3LABEL]]
				;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
				;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
				;CHECK: [[OPT4LABEL]]:
				;CHECK: b .[[LATCHLABEL]]
				define void @loop_test(i32* %tags, i32 %count) {
				entry:
				br label %for.check
				for.check:
				%count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch]
				%done.count = icmp ugt i32 %count.loop, 0
				%tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count
				%tag = load i32, i32* %tag_ptr
				%done.tag = icmp eq i32 %tag, 0
				%done = and i1 %done.count, %done.tag
				br i1 %done, label %test1, label %exit, !prof !1
				test1:
				%tagbit1 = and i32 %tag, 1
				%tagbit1eq0 = icmp eq i32 %tagbit1, 0
				br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
				optional1:
				call void @a()
				call void @a()
				call void @a()
				call void @a()
				br label %test2
				test2:
				%tagbit2 = and i32 %tag, 2
				%tagbit2eq0 = icmp eq i32 %tagbit2, 0
				br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
				optional2:
				call void @b()
				call void @b()
				call void @b()
				call void @b()
				br label %test3
				test3:
				%tagbit3 = and i32 %tag, 4
				%tagbit3eq0 = icmp eq i32 %tagbit3, 0
				br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
				optional3:
				call void @c()
				call void @c()
				call void @c()
				call void @c()
				br label %test4
				test4:
				%tagbit4 = and i32 %tag, 8
				%tagbit4eq0 = icmp eq i32 %tagbit4, 0
				br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1
				optional4:
				call void @d()
				call void @d()
				call void @d()
				call void @d()
				br label %for.latch
				for.latch:
				%count.sub = sub i32 %count.loop, 1
				br label %for.check
				exit:
				ret void
				}

				; The block then2 is not unavoidable, meaning it does not dominate the exit.
				; But since it can be tail-duplicated, it should be placed as a fallthrough from
				; test2 and copied. The purpose here is to make sure that the tail-duplication
				; code is independent of the outlining code, which works by choosing the
				; "unavoidable" blocks.
				; CHECK-LABEL: avoidable_test:
				; CHECK: # %entry
				; CHECK: andi.
				; CHECK: # %test2
				; Make sure then2 falls through from test2
				; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
				; CHECK: # %then2
				; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
				; CHECK: # %else1
				; CHECK: bl a
				; CHECK: bl a
				; Make sure then2 was copied into else1
				; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
				; CHECK: # %end1
				; CHECK: bl d
				; CHECK: # %else2
				; CHECK: bl c
				; CHECK: # %end2
				define void @avoidable_test(i32 %tag) {
				entry:
				br label %test1
				test1:
				%tagbit1 = and i32 %tag, 1
				%tagbit1eq0 = icmp eq i32 %tagbit1, 0
				br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely
				else1:
				call void @a()
				call void @a()
				br label %then2
				test2:
				%tagbit2 = and i32 %tag, 2
				%tagbit2eq0 = icmp eq i32 %tagbit2, 0
				br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely
				then2:
				%tagbit3 = and i32 %tag, 4
				%tagbit3eq0 = icmp eq i32 %tagbit3, 0
				br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely
				else2:
				call void @c()
				br label %end2
				end2:
				ret void
				end1:
				call void @d()
				ret void
				}

				; CHECK-LABEL: trellis_test
				; The number in the block labels is the expected block frequency given the
				; probabilities annotated. There is a conflict in the b;c->d;e trellis that
				; should be resolved as c->e;b->d.
				; The d;e->f;g trellis should be resolved as e->g;d->f.
				; The f;g->h;i trellis should be resolved as f->i;g->h.
				; The h;i->j;ret trellis contains a triangle edge, and should be resolved as
				; h->j->ret
				; CHECK: # %entry
				; CHECK: # %c10
				; CHECK: # %e9
				; CHECK: # %g10
				; CHECK: # %h10
				; CHECK: # %j8
				; CHECK: # %ret
				; CHECK: # %b6
				; CHECK: # %d7
				; CHECK: # %f6
				; CHECK: # %i6
				define void @trellis_test(i32 %tag) {
				entry:
				br label %a16
				a16:
				call void @a()
				call void @a()
				%tagbits.a = and i32 %tag, 3
				%tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
				br i1 %tagbits.a.eq0, label %c10, label %b6, !prof !1 ; 10 to 6
				c10:
				call void @c()
				call void @c()
				%tagbits.c = and i32 %tag, 12
				%tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
				; Both of these edges should be hotter than the other incoming edge
				; for e9 or d7
				br i1 %tagbits.c.eq0, label %e9, label %d7, !prof !3 ; 6 to 4
				e9:
				call void @e()
				call void @e()
				%tagbits.e = and i32 %tag, 48
				%tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
				br i1 %tagbits.e.eq0, label %g10, label %f6, !prof !4 ; 7 to 2
				g10:
				call void @g()
				call void @g()
				%tagbits.g = and i32 %tag, 192
				%tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
				br i1 %tagbits.g.eq0, label %i6, label %h10, !prof !5 ; 2 to 8
				i6:
				call void @i()
				call void @i()
				%tagbits.i = and i32 %tag, 768
				%tagbits.i.eq0 = icmp eq i32 %tagbits.i, 0
				br i1 %tagbits.i.eq0, label %ret, label %j8, !prof !2 ; balanced (3 to 3)
				b6:
				call void @b()
				call void @b()
				%tagbits.b = and i32 %tag, 12
				%tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
				br i1 %tagbits.b.eq1, label %e9, label %d7, !prof !2 ; balanced (3 to 3)
				d7:
				call void @d()
				call void @d()
				%tagbits.d = and i32 %tag, 48
				%tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
				br i1 %tagbits.d.eq1, label %g10, label %f6, !prof !6 ; 3 to 4
				f6:
				call void @f()
				call void @f()
				%tagbits.f = and i32 %tag, 192
				%tagbits.f.eq1 = icmp eq i32 %tagbits.f, 128
				br i1 %tagbits.f.eq1, label %i6, label %h10, !prof !7 ; 4 to 2
				h10:
				call void @h()
				call void @h()
				%tagbits.h = and i32 %tag, 768
				%tagbits.h.eq1 = icmp eq i32 %tagbits.h, 512
				br i1 %tagbits.h.eq1, label %ret, label %j8, !prof !2 ; balanced (5 to 5)
				j8:
				call void @j()
				call void @j()
				br label %ret
				ret:
				ret void
				}

				; Verify that we still consider tail-duplication opportunities if we find a
				; triangle trellis. Here D->F->G is the triangle, and D;E are both predecessors
				; of both F and G. The basic trellis algorithm picks the F->G edge, but after
				; checking, it's profitable to duplicate G into F. The weights here are not
				; really important. They are there to help make the test stable.
				; CHECK-LABEL: trellis_then_dup_test
				; CHECK: # %entry
				; CHECK: # %b
				; CHECK: # %d
				; CHECK: # %g
				; CHECK: # %ret1
				; CHECK: # %c
				; CHECK: # %e
				; CHECK: # %f
				; CHECK: # %ret2
				; CHECK: # %ret
				define void @trellis_then_dup_test(i32 %tag) {
				entry:
				br label %a
				a:
				call void @a()
				call void @a()
				%tagbits.a = and i32 %tag, 3
				%tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
				br i1 %tagbits.a.eq0, label %b, label %c, !prof !1 ; 5 to 3
				b:
				call void @b()
				call void @b()
				%tagbits.b = and i32 %tag, 12
				%tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
				br i1 %tagbits.b.eq1, label %d, label %e, !prof !1 ; 5 to 3
				d:
				call void @d()
				call void @d()
				%tagbits.d = and i32 %tag, 48
				%tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
				br i1 %tagbits.d.eq1, label %g, label %f, !prof !1 ; 5 to 3
				f:
				call void @f()
				call void @f()
				br label %g
				g:
				%tagbits.g = and i32 %tag, 192
				%tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
				br i1 %tagbits.g.eq0, label %ret1, label %ret2, !prof !2 ; balanced
				c:
				call void @c()
				call void @c()
				%tagbits.c = and i32 %tag, 12
				%tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
				br i1 %tagbits.c.eq0, label %d, label %e, !prof !1 ; 5 to 3
				e:
				call void @e()
				call void @e()
				%tagbits.e = and i32 %tag, 48
				%tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
				br i1 %tagbits.e.eq0, label %g, label %f, !prof !1 ; 5 to 3
				ret1:
				call void @a()
				br label %ret
				ret2:
				call void @b()
				br label %ret
				ret:
				ret void
				}

	declare void @a()			declare void @a()
	declare void @b()			declare void @b()
	declare void @c()			declare void @c()
	declare void @d()			declare void @d()
				declare void @e()
				declare void @f()
				declare void @g()
				declare void @h()
				declare void @i()
				declare void @j()

				!1 = !{!"branch_weights", i32 5, i32 3}
				!2 = !{!"branch_weights", i32 50, i32 50}
				!3 = !{!"branch_weights", i32 6, i32 4}
				!4 = !{!"branch_weights", i32 7, i32 2}
				!5 = !{!"branch_weights", i32 2, i32 8}
				!6 = !{!"branch_weights", i32 3, i32 4}
				!7 = !{!"branch_weights", i32 4, i32 2}

llvm/trunk/test/CodeGen/SPARC/sjlj.ll

	Show First 20 Lines • Show All 61 Lines • ▼ Show 20 Lines
	; CHECK: st %i1, [%i0+4]			; CHECK: st %i1, [%i0+4]
	; CHECK: st %sp, [%i0+8]			; CHECK: st %sp, [%i0+8]
	; CHECK: bn .LBB1_2			; CHECK: bn .LBB1_2
	; CHECK: st %i7, [%i0+12]			; CHECK: st %i7, [%i0+12]
	; CHECK: ba .LBB1_1			; CHECK: ba .LBB1_1
	; CHECK: nop			; CHECK: nop
	; CHECK:.LBB1_1: ! %entry			; CHECK:.LBB1_1: ! %entry
	; CHECK: mov %g0, %i0			; CHECK: mov %g0, %i0
				; CHECK: ! %entry
	; CHECK: cmp %i0, 0			; CHECK: cmp %i0, 0
	; CHECK: bne .LBB1_4
	; CHECK: ba .LBB1_5
	; CHECK:.LBB1_2: ! Block address taken
	; CHECK: mov 1, %i0
	; CHECK: be .LBB1_5			; CHECK: be .LBB1_5
				; CHECK: nop
	; CHECK:.LBB1_4:			; CHECK:.LBB1_4:
				; CHECK: mov 1, %i0
	; CHECK: ba .LBB1_6			; CHECK: ba .LBB1_6
				; CHECK:.LBB1_2: ! Block address taken
				; CHECK: mov 1, %i0
				; CHECK: cmp %i0, 0
				; CHECK: bne .LBB1_4
				; CHECK: nop
	}			}
	declare i8* @llvm.frameaddress(i32) #2			declare i8* @llvm.frameaddress(i32) #2

	declare i8* @llvm.stacksave() #3			declare i8* @llvm.stacksave() #3

	declare i32 @llvm.eh.sjlj.setjmp(i8*) #3			declare i32 @llvm.eh.sjlj.setjmp(i8*) #3

	attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }			attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
	attributes #1 = { noreturn nounwind }			attributes #1 = { noreturn nounwind }
	attributes #2 = { nounwind readnone }			attributes #2 = { nounwind readnone }
	attributes #3 = { nounwind }			attributes #3 = { nounwind }

llvm/trunk/test/CodeGen/WebAssembly/mem-intrinsics.ll

	; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s			; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s

	; Test memcpy, memmove, and memset intrinsics.			; Test memcpy, memmove, and memset intrinsics.

	target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"			target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
	target triple = "wasm32-unknown-unknown"			target triple = "wasm32-unknown-unknown"

	declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)			declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
	declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)			declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
	▲ Show 20 Lines • Show All 131 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/block-placement.ll

Show First 20 Lines • Show All 308 Lines • ▼ Show 20 Lines

exit:		exit:
ret i32 %sum		ret i32 %sum
}		}

define void @unnatural_cfg1() {		define void @unnatural_cfg1() {
; Test that we can handle a loop with an inner unnatural loop at the end of		; Test that we can handle a loop with an inner unnatural loop at the end of
; a function. This is a gross CFG reduced out of the single source GCC.		; a function. This is a gross CFG reduced out of the single source GCC.
; CHECK: unnatural_cfg1		; CHECK-LABEL: unnatural_cfg1
; CHECK: %entry		; CHECK: %entry
; CHECK: %loop.body1		; CHECK: %loop.body1
; CHECK: %loop.body2		; CHECK: %loop.body2
; CHECK: %loop.body3		; CHECK: %loop.body3

entry:		entry:
br label %loop.header		br label %loop.header

Show All 21 Lines	loop.body5:
%ptr2 = load i32, i32* undef, align 4		%ptr2 = load i32, i32* undef, align 4
br label %loop.body3		br label %loop.body3
}		}

define void @unnatural_cfg2() {		define void @unnatural_cfg2() {
; Test that we can handle a loop with a nested natural loop and an unnatural		; Test that we can handle a loop with a nested natural loop and an unnatural
; loop. This was reduced from a crash on block placement when run over		; loop. This was reduced from a crash on block placement when run over
; single-source GCC.		; single-source GCC.
; CHECK: unnatural_cfg2		; CHECK-LABEL: unnatural_cfg2
; CHECK: %entry		; CHECK: %entry
; CHECK: %loop.body1		; CHECK: %loop.body1
; CHECK: %loop.body2		; CHECK: %loop.body2
; CHECK: %loop.body3
; CHECK: %loop.inner1.begin
; The end block is folded with %loop.body3...
; CHECK-NOT: %loop.inner1.end
; CHECK: %loop.body4		; CHECK: %loop.body4
; CHECK: %loop.inner2.begin		; CHECK: %loop.inner2.begin
; The loop.inner2.end block is folded		; CHECK: %loop.inner2.begin
		; CHECK: %loop.body3
		; CHECK: %loop.inner1.begin
; CHECK: %loop.header		; CHECK: %loop.header
; CHECK: %bail		; CHECK: %bail

entry:		entry:
br label %loop.header		br label %loop.header

loop.header:		loop.header:
%comp0 = icmp eq i32* undef, null		%comp0 = icmp eq i32* undef, null
▲ Show 20 Lines • Show All 180 Lines • ▼ Show 20 Lines
declare i32 @__gxx_personality_v0(...)		declare i32 @__gxx_personality_v0(...)

define void @test_eh_lpad_successor() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {		define void @test_eh_lpad_successor() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
; Sometimes the landing pad ends up as the first successor of an invoke block.		; Sometimes the landing pad ends up as the first successor of an invoke block.
; When this happens, a strange result used to fall out of updateTerminators: we		; When this happens, a strange result used to fall out of updateTerminators: we
; didn't correctly locate the fallthrough successor, assuming blindly that the		; didn't correctly locate the fallthrough successor, assuming blindly that the
; first one was the fallthrough successor. As a result, we would add an		; first one was the fallthrough successor. As a result, we would add an
; erroneous jump to the landing pad thinking that was the default successor.		; erroneous jump to the landing pad thinking that was the default successor.
; CHECK: test_eh_lpad_successor		; CHECK-LABEL: test_eh_lpad_successor
; CHECK: %entry		; CHECK: %entry
; CHECK-NOT: jmp		; CHECK-NOT: jmp
; CHECK: %loop		; CHECK: %loop

entry:		entry:
invoke i32 @f() to label %preheader unwind label %lpad		invoke i32 @f() to label %preheader unwind label %lpad

preheader:		preheader:
Show All 11 Lines
declare void @fake_throw() noreturn		declare void @fake_throw() noreturn

define void @test_eh_throw() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {		define void @test_eh_throw() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
; For blocks containing a 'throw' (or similar functionality), we have		; For blocks containing a 'throw' (or similar functionality), we have
; a no-return invoke. In this case, only EH successors will exist, and		; a no-return invoke. In this case, only EH successors will exist, and
; fallthrough simply won't occur. Make sure we don't crash trying to update		; fallthrough simply won't occur. Make sure we don't crash trying to update
; terminators for such constructs.		; terminators for such constructs.
;		;
; CHECK: test_eh_throw		; CHECK-LABEL: test_eh_throw
; CHECK: %entry		; CHECK: %entry
; CHECK: %cleanup		; CHECK: %cleanup

entry:		entry:
invoke void @fake_throw() to label %continue unwind label %cleanup		invoke void @fake_throw() to label %continue unwind label %cleanup

continue:		continue:
unreachable		unreachable

cleanup:		cleanup:
%0 = landingpad { i8*, i32 }		%0 = landingpad { i8*, i32 }
cleanup		cleanup
unreachable		unreachable
}		}

define void @test_unnatural_cfg_backwards_inner_loop() {		define void @test_unnatural_cfg_backwards_inner_loop() {
; Test that when we encounter an unnatural CFG structure after having formed		; Test that when we encounter an unnatural CFG structure after having formed
; a chain for an inner loop which happened to be laid out backwards we don't		; a chain for an inner loop which happened to be laid out backwards we don't
; attempt to merge onto the wrong end of the inner loop just because we find it		; attempt to merge onto the wrong end of the inner loop just because we find it
; first. This was reduced from a crasher in GCC's single source.		; first. This was reduced from a crasher in GCC's single source.
;		;
; CHECK: test_unnatural_cfg_backwards_inner_loop		; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
; CHECK: %entry		; CHECK: %entry
; CHECK: %loop2b		; CHECK: %loop2b
; CHECK: %loop1		; CHECK: %loop1

entry:		entry:
br i1 undef, label %loop2a, label %body		br i1 undef, label %loop2a, label %body

body:		body:
Show All 23 Lines

define void @unanalyzable_branch_to_loop_header() {		define void @unanalyzable_branch_to_loop_header() {
; Ensure that we can handle unanalyzable branches into loop headers. We		; Ensure that we can handle unanalyzable branches into loop headers. We
; pre-form chains for unanalyzable branches, and will find the tail end of that		; pre-form chains for unanalyzable branches, and will find the tail end of that
; at the start of the loop. This function uses floating point comparison		; at the start of the loop. This function uses floating point comparison
; fallthrough because that happens to always produce unanalyzable branches on		; fallthrough because that happens to always produce unanalyzable branches on
; x86.		; x86.
;		;
; CHECK: unanalyzable_branch_to_loop_header		; CHECK-LABEL: unanalyzable_branch_to_loop_header
; CHECK: %entry		; CHECK: %entry
; CHECK: %loop		; CHECK: %loop
; CHECK: %exit		; CHECK: %exit

entry:		entry:
%cmp = fcmp une double 0.000000e+00, undef		%cmp = fcmp une double 0.000000e+00, undef
br i1 %cmp, label %loop, label %exit		br i1 %cmp, label %loop, label %exit

loop:		loop:
%cond = icmp eq i8 undef, 42		%cond = icmp eq i8 undef, 42
br i1 %cond, label %exit, label %loop		br i1 %cond, label %exit, label %loop

exit:		exit:
ret void		ret void
}		}

define void @unanalyzable_branch_to_best_succ(i1 %cond) {		define void @unanalyzable_branch_to_best_succ(i1 %cond) {
; Ensure that we can handle unanalyzable branches where the destination block		; Ensure that we can handle unanalyzable branches where the destination block
; gets selected as the optimal successor to merge.		; gets selected as the optimal successor to merge.
;		;
; This branch is now analyzable and hence the destination block becomes the		; This branch is now analyzable and hence the destination block becomes the
; hotter one. The right order is entry->bar->exit->foo.		; hotter one. The right order is entry->bar->exit->foo.
;		;
; CHECK: unanalyzable_branch_to_best_succ		; CHECK-LABEL: unanalyzable_branch_to_best_succ
; CHECK: %entry		; CHECK: %entry
; CHECK: %bar		; CHECK: %bar
; CHECK: %exit		; CHECK: %exit
; CHECK: %foo		; CHECK: %foo

entry:		entry:
; Bias this branch toward bar to ensure we form that chain.		; Bias this branch toward bar to ensure we form that chain.
br i1 %cond, label %bar, label %foo, !prof !1		br i1 %cond, label %bar, label %foo, !prof !1
Show All 9 Lines
exit:		exit:
ret void		ret void
}		}

define void @unanalyzable_branch_to_free_block(float %x) {		define void @unanalyzable_branch_to_free_block(float %x) {
; Ensure that we can handle unanalyzable branches where the destination block		; Ensure that we can handle unanalyzable branches where the destination block
; gets selected as the best free block in the CFG.		; gets selected as the best free block in the CFG.
;		;
; CHECK: unanalyzable_branch_to_free_block		; CHECK-LABEL: unanalyzable_branch_to_free_block
; CHECK: %entry		; CHECK: %entry
; CHECK: %a		; CHECK: %a
; CHECK: %b		; CHECK: %b
; CHECK: %c		; CHECK: %c
; CHECK: %exit		; CHECK: %exit

entry:		entry:
br i1 undef, label %a, label %b		br i1 undef, label %a, label %b
Show All 13 Lines
exit:		exit:
ret void		ret void
}		}

define void @many_unanalyzable_branches() {		define void @many_unanalyzable_branches() {
; Ensure that we don't crash as we're building up many unanalyzable branches,		; Ensure that we don't crash as we're building up many unanalyzable branches,
; blocks, and loops.		; blocks, and loops.
;		;
; CHECK: many_unanalyzable_branches		; CHECK-LABEL: many_unanalyzable_branches
; CHECK: %entry		; CHECK: %entry
; CHECK: %exit		; CHECK: %exit

entry:		entry:
br label %0		br label %0

%val0 = load volatile float, float* undef		%val0 = load volatile float, float* undef
%cmp0 = fcmp une float %val0, undef		%cmp0 = fcmp une float %val0, undef
▲ Show 20 Lines • Show All 202 Lines • ▼ Show 20 Lines
; 1) Loop rotation needs to ensure that the desired exiting edge can be		; 1) Loop rotation needs to ensure that the desired exiting edge can be
; a fallthrough.		; a fallthrough.
; 2) The exiting edge from the loop which is rotated to be laid out at the		; 2) The exiting edge from the loop which is rotated to be laid out at the
; bottom of the loop needs to be exiting into the nearest enclosing loop (to		; bottom of the loop needs to be exiting into the nearest enclosing loop (to
; which there is an exit). Otherwise, we force that enclosing loop into		; which there is an exit). Otherwise, we force that enclosing loop into
; strange layouts that are significantly less efficient, often times making		; strange layouts that are significantly less efficient, often times making
; it discontiguous.		; it discontiguous.
;		;
; CHECK: @benchmark_heapsort		; CHECK-LABEL: @benchmark_heapsort
; CHECK: %entry		; CHECK: %entry
; First rotated loop top.		; First rotated loop top.
; CHECK: .p2align		; CHECK: .p2align
; CHECK: %while.end		; CHECK: %while.end
; %for.cond gets completely tail-duplicated away.		; %for.cond gets completely tail-duplicated away.
; CHECK: %if.then		; CHECK: %if.then
; CHECK: %if.else		; CHECK: %if.else
; CHECK: %if.end10		; CHECK: %if.end10
▲ Show 20 Lines • Show All 505 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/bypass-slow-division-32.ll

	Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: testl $-256, %edi			; CHECK-NEXT: testl $-256, %edi
	; CHECK-NEXT: je .LBB3_1			; CHECK-NEXT: je .LBB3_1
	; CHECK-NEXT: # BB#2:			; CHECK-NEXT: # BB#2:
	; CHECK-NEXT: movl %ecx, %eax			; CHECK-NEXT: movl %ecx, %eax
	; CHECK-NEXT: cltd			; CHECK-NEXT: cltd
	; CHECK-NEXT: idivl %ebx			; CHECK-NEXT: idivl %ebx
	; CHECK-NEXT: movl %eax, %esi			; CHECK-NEXT: movl %eax, %esi
	; CHECK-NEXT: testl $-256, %edi			; CHECK-NEXT: testl $-256, %edi
	; CHECK-NEXT: jne .LBB3_5
	; CHECK-NEXT: jmp .LBB3_4
	; CHECK-NEXT: .LBB3_1:
	; CHECK-NEXT: movzbl %cl, %eax
	; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
	; CHECK-NEXT: divb %bl
	; CHECK-NEXT: movzbl %al, %esi
	; CHECK-NEXT: testl $-256, %edi
	; CHECK-NEXT: je .LBB3_4			; CHECK-NEXT: je .LBB3_4
	; CHECK-NEXT: .LBB3_5:			; CHECK-NEXT: .LBB3_5:
	; CHECK-NEXT: xorl %edx, %edx			; CHECK-NEXT: xorl %edx, %edx
	; CHECK-NEXT: movl %ecx, %eax			; CHECK-NEXT: movl %ecx, %eax
	; CHECK-NEXT: divl %ebx			; CHECK-NEXT: divl %ebx
	; CHECK-NEXT: jmp .LBB3_6			; CHECK-NEXT: jmp .LBB3_6
				; CHECK-NEXT: .LBB3_1:
				; CHECK-NEXT: movzbl %cl, %eax
				; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
				; CHECK-NEXT: divb %bl
				; CHECK-NEXT: movzbl %al, %esi
				; CHECK-NEXT: testl $-256, %edi
				; CHECK-NEXT: jne .LBB3_5
	; CHECK-NEXT: .LBB3_4:			; CHECK-NEXT: .LBB3_4:
	; CHECK-NEXT: movzbl %cl, %eax			; CHECK-NEXT: movzbl %cl, %eax
	; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>			; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
	; CHECK-NEXT: divb %bl			; CHECK-NEXT: divb %bl
	; CHECK-NEXT: movzbl %al, %eax			; CHECK-NEXT: movzbl %al, %eax
	; CHECK-NEXT: .LBB3_6:			; CHECK-NEXT: .LBB3_6:
	; CHECK-NEXT: addl %eax, %esi			; CHECK-NEXT: addl %eax, %esi
	; CHECK-NEXT: movl %esi, %eax			; CHECK-NEXT: movl %esi, %eax
	▲ Show 20 Lines • Show All 121 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/sse1.ll

	Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines
	; X32: # BB#0: # %entry			; X32: # BB#0: # %entry
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: xorps %xmm0, %xmm0			; X32-NEXT: xorps %xmm0, %xmm0
	; X32-NEXT: je .LBB1_1			; X32-NEXT: je .LBB1_1
	; X32-NEXT: # BB#2: # %entry			; X32-NEXT: # BB#2: # %entry
	; X32-NEXT: xorps %xmm1, %xmm1			; X32-NEXT: xorps %xmm1, %xmm1
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: jne .LBB1_5			; X32-NEXT: jne .LBB1_5
	; X32-NEXT: jmp .LBB1_4			; X32-NEXT: .LBB1_4:
				; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
				; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
				; X32-NEXT: jne .LBB1_8
				; X32-NEXT: .LBB1_7:
				; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
				; X32-NEXT: jmp .LBB1_9
	; X32-NEXT: .LBB1_1:			; X32-NEXT: .LBB1_1:
	; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero			; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: je .LBB1_4			; X32-NEXT: je .LBB1_4
	; X32-NEXT: .LBB1_5: # %entry			; X32-NEXT: .LBB1_5: # %entry
	; X32-NEXT: xorps %xmm2, %xmm2			; X32-NEXT: xorps %xmm2, %xmm2
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: jne .LBB1_8
	; X32-NEXT: jmp .LBB1_7
	; X32-NEXT: .LBB1_4:
	; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: je .LBB1_7			; X32-NEXT: je .LBB1_7
	; X32-NEXT: .LBB1_8: # %entry			; X32-NEXT: .LBB1_8: # %entry
	; X32-NEXT: xorps %xmm3, %xmm3			; X32-NEXT: xorps %xmm3, %xmm3
	; X32-NEXT: jmp .LBB1_9
	; X32-NEXT: .LBB1_7:
	; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
	; X32-NEXT: .LBB1_9: # %entry			; X32-NEXT: .LBB1_9: # %entry
	; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)			; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
	; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]			; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
	; X32-NEXT: jne .LBB1_11			; X32-NEXT: jne .LBB1_11
	; X32-NEXT: # BB#10:			; X32-NEXT: # BB#10:
	; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; X32-NEXT: .LBB1_11: # %entry			; X32-NEXT: .LBB1_11: # %entry
	; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]			; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: vselect:			; X64-LABEL: vselect:
	; X64: # BB#0: # %entry			; X64: # BB#0: # %entry
	; X64-NEXT: testl %ecx, %ecx			; X64-NEXT: testl %ecx, %ecx
	; X64-NEXT: xorps %xmm0, %xmm0			; X64-NEXT: xorps %xmm0, %xmm0
	; X64-NEXT: je .LBB1_1			; X64-NEXT: je .LBB1_1
	; X64-NEXT: # BB#2: # %entry			; X64-NEXT: # BB#2: # %entry
	; X64-NEXT: xorps %xmm1, %xmm1			; X64-NEXT: xorps %xmm1, %xmm1
	; X64-NEXT: testl %edx, %edx			; X64-NEXT: testl %edx, %edx
	; X64-NEXT: jne .LBB1_5			; X64-NEXT: jne .LBB1_5
	; X64-NEXT: jmp .LBB1_4			; X64-NEXT: .LBB1_4:
				; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
				; X64-NEXT: testl %r8d, %r8d
				; X64-NEXT: jne .LBB1_8
				; X64-NEXT: .LBB1_7:
				; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
				; X64-NEXT: jmp .LBB1_9
	; X64-NEXT: .LBB1_1:			; X64-NEXT: .LBB1_1:
	; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero			; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
	; X64-NEXT: testl %edx, %edx			; X64-NEXT: testl %edx, %edx
	; X64-NEXT: je .LBB1_4			; X64-NEXT: je .LBB1_4
	; X64-NEXT: .LBB1_5: # %entry			; X64-NEXT: .LBB1_5: # %entry
	; X64-NEXT: xorps %xmm2, %xmm2			; X64-NEXT: xorps %xmm2, %xmm2
	; X64-NEXT: testl %r8d, %r8d			; X64-NEXT: testl %r8d, %r8d
	; X64-NEXT: jne .LBB1_8
	; X64-NEXT: jmp .LBB1_7
	; X64-NEXT: .LBB1_4:
	; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
	; X64-NEXT: testl %r8d, %r8d
	; X64-NEXT: je .LBB1_7			; X64-NEXT: je .LBB1_7
	; X64-NEXT: .LBB1_8: # %entry			; X64-NEXT: .LBB1_8: # %entry
	; X64-NEXT: xorps %xmm3, %xmm3			; X64-NEXT: xorps %xmm3, %xmm3
	; X64-NEXT: jmp .LBB1_9
	; X64-NEXT: .LBB1_7:
	; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB1_9: # %entry			; X64-NEXT: .LBB1_9: # %entry
	; X64-NEXT: testl %esi, %esi			; X64-NEXT: testl %esi, %esi
	; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]			; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
	; X64-NEXT: jne .LBB1_11			; X64-NEXT: jne .LBB1_11
	; X64-NEXT: # BB#10:			; X64-NEXT: # BB#10:
	; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero			; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; X64-NEXT: .LBB1_11: # %entry			; X64-NEXT: .LBB1_11: # %entry
	; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]			; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
	▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines
	; X64-NEXT: movl %eax, (%rdi)			; X64-NEXT: movl %eax, (%rdi)
	; X64-NEXT: movq %rdi, %rax			; X64-NEXT: movq %rdi, %rax
	; X64-NEXT: retq			; X64-NEXT: retq
	%cmp = icmp eq <4 x i32> %x, %y			%cmp = icmp eq <4 x i32> %x, %y
	%zext = zext <4 x i1> %cmp to <4 x i32>			%zext = zext <4 x i1> %cmp to <4 x i32>
	ret <4 x i32> %zext			ret <4 x i32> %zext
	}			}

	; Fragile test warning - we need to induce the generation of a vselect			; Fragile test warning - we need to induce the generation of a vselect
	; post-legalization to cause the crash seen in:			; post-legalization to cause the crash seen in:
	; https://llvm.org/bugs/show_bug.cgi?id=31672			; https://llvm.org/bugs/show_bug.cgi?id=31672
	; Is there a way to do that without an unsafe/fast sqrt intrinsic call?			; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
	; Also, although the goal for adding this test is to prove that we			; Also, although the goal for adding this test is to prove that we
	; don't crash, I have no idea what this code is doing, so I'm keeping			; don't crash, I have no idea what this code is doing, so I'm keeping
	; the full codegen checks in case there's motivation to improve this.			; the full codegen checks in case there's motivation to improve this.

	define <2 x float> @PR31672() #0 {			define <2 x float> @PR31672() #0 {
	▲ Show 20 Lines • Show All 124 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/tail-dup-merge-loop-headers.ll

	; RUN: llc -O2 -o - %s | FileCheck %s			; RUN: llc -O2 -o - %s | FileCheck %s
	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"			target triple = "x86_64-unknown-linux-gnu"

	; Function Attrs: nounwind uwtable			; Function Attrs: nounwind uwtable
	; CHECK-LABEL: tail_dup_merge_loops			; CHECK-LABEL: tail_dup_merge_loops
	; CHECK: # %entry			; CHECK: # %entry
	; CHECK-NOT: # %{{[a-zA-Z_]+}}			; CHECK-NOT: # %{{[a-zA-Z_]+}}
				; CHECK: # %exit
				; CHECK-NOT: # %{{[a-zA-Z_]+}}
	; CHECK: # %inner_loop_exit			; CHECK: # %inner_loop_exit
	; CHECK-NOT: # %{{[a-zA-Z_]+}}			; CHECK-NOT: # %{{[a-zA-Z_]+}}
	; CHECK: # %inner_loop_latch			; CHECK: # %inner_loop_latch
	; CHECK-NOT: # %{{[a-zA-Z_]+}}			; CHECK-NOT: # %{{[a-zA-Z_]+}}
	; CHECK: # %inner_loop_test			; CHECK: # %inner_loop_test
	; CHECK-NOT: # %{{[a-zA-Z_]+}}
	; CHECK: # %exit
	define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {			define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
	entry:			entry:
	%notlhs674.i = icmp eq i32 %a, 0			%notlhs674.i = icmp eq i32 %a, 0
	br label %outer_loop_top			br label %outer_loop_top

	outer_loop_top: ; preds = %inner_loop_exit, %entry			outer_loop_top: ; preds = %inner_loop_exit, %entry
	%dst.0.ph.i = phi i8* [ %b, %entry ], [ %scevgep679.i, %inner_loop_exit ]			%dst.0.ph.i = phi i8* [ %b, %entry ], [ %scevgep679.i, %inner_loop_exit ]
	br i1 %notlhs674.i, label %exit, label %inner_loop_preheader			br i1 %notlhs674.i, label %exit, label %inner_loop_preheader
	▲ Show 20 Lines • Show All 167 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/tail-dup-repeat.ll

	; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s			; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"			target triple = "x86_64-unknown-linux-gnu"

	; Function Attrs: uwtable			; Function Attrs: uwtable
	; When tail-duplicating during placement, we work backward from blocks with			; When tail-duplicating during placement, we work backward from blocks with
	; multiple successors. In this case, the block dup1 gets duplicated into dup2			; multiple successors. In this case, the block dup1 gets duplicated into dup2
	; and if.then64, and then the block dup2 gets duplicated into land.lhs.true			; and if.then64, and then the block dup2 gets duplicated into land.lhs.true
	; and if.end70			; and if.end70
	▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/tail-opts.ll

	Show First 20 Lines • Show All 107 Lines • ▼ Show 20 Lines
	; with only a branch in common, regardless of the fallthrough situation.			; with only a branch in common, regardless of the fallthrough situation.

	; CHECK-LABEL: dont_merge_oddly:			; CHECK-LABEL: dont_merge_oddly:
	; CHECK-NOT: ret			; CHECK-NOT: ret
	; CHECK: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}			; CHECK: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
	; CHECK-NEXT: jbe .LBB2_3			; CHECK-NEXT: jbe .LBB2_3
	; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}			; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
	; CHECK-NEXT: ja .LBB2_4			; CHECK-NEXT: ja .LBB2_4
	; CHECK-NEXT: jmp .LBB2_2			; CHECK-NEXT: .LBB2_2:
				; CHECK-NEXT: movb $1, %al
				; CHECK-NEXT: ret
	; CHECK-NEXT: .LBB2_3:			; CHECK-NEXT: .LBB2_3:
	; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}			; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
	; CHECK-NEXT: jbe .LBB2_2			; CHECK-NEXT: jbe .LBB2_2
	; CHECK-NEXT: .LBB2_4:			; CHECK-NEXT: .LBB2_4:
	; CHECK-NEXT: xorl %eax, %eax			; CHECK-NEXT: xorl %eax, %eax
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	; CHECK-NEXT: .LBB2_2:
	; CHECK-NEXT: movb $1, %al
	; CHECK-NEXT: ret

	define i1 @dont_merge_oddly(float* %result) nounwind {			define i1 @dont_merge_oddly(float* %result) nounwind {
	entry:			entry:
	%tmp4 = getelementptr float, float* %result, i32 2			%tmp4 = getelementptr float, float* %result, i32 2
	%tmp5 = load float, float* %tmp4, align 4			%tmp5 = load float, float* %tmp4, align 4
	%tmp7 = getelementptr float, float* %result, i32 4			%tmp7 = getelementptr float, float* %result, i32 4
	%tmp8 = load float, float* %tmp7, align 4			%tmp8 = load float, float* %tmp7, align 4
	%tmp10 = getelementptr float, float* %result, i32 6			%tmp10 = getelementptr float, float* %result, i32 6
	▲ Show 20 Lines • Show All 425 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/twoaddr-coalesce-3.ll

Show All 13 Lines	entry:
br i1 %cmp3, label %for.body.lr.ph, label %for.end		br i1 %cmp3, label %for.body.lr.ph, label %for.end

for.body.lr.ph: ; preds = %entry		for.body.lr.ph: ; preds = %entry
%total.promoted = load i32, i32* @total, align 4		%total.promoted = load i32, i32* @total, align 4
br label %for.body		br label %for.body

; Check that only one mov will be generated in the kernel loop.		; Check that only one mov will be generated in the kernel loop.
; CHECK-LABEL: foo:		; CHECK-LABEL: foo:
; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body		; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]		; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: shrl $31, [[REG1]]		; CHECK: shrl $31, [[REG1]]
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: jl [[LOOP1]]		; CHECK: jl [[LOOP1]]
for.body: ; preds = %for.body.lr.ph, %for.body		for.body: ; preds = %for.body.lr.ph, %for.body
%add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ]		%add5 = phi i32 [ %total.promoted, %for.body.lr.ph ], [ %add, %for.body ]
Show All 20 Lines	entry:
br i1 %cmp3, label %for.body.lr.ph, label %for.end		br i1 %cmp3, label %for.body.lr.ph, label %for.end

for.body.lr.ph: ; preds = %entry		for.body.lr.ph: ; preds = %entry
%total.promoted = load i32, i32* @total, align 4		%total.promoted = load i32, i32* @total, align 4
br label %for.body		br label %for.body

; Check that only two mov will be generated in the kernel loop.		; Check that only two mov will be generated in the kernel loop.
; CHECK-LABEL: goo:		; CHECK-LABEL: goo:
; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body		; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]		; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: shrl $31, [[REG2]]		; CHECK: shrl $31, [[REG2]]
; CHECK-NOT: mov		; CHECK-NOT: mov
; CHECK: movl {{.*}}		; CHECK: movl {{.*}}
; CHECK: jl [[LOOP2]]		; CHECK: jl [[LOOP2]]
for.body: ; preds = %for.body.lr.ph, %for.body		for.body: ; preds = %for.body.lr.ph, %for.body
Show All 17 Lines

llvm/trunk/test/CodeGen/X86/win-alloca-expander.ll

Show First 20 Lines • Show All 109 Lines • ▼ Show 20 Lines	; CHECK: calll __chkstk
call void @f(%struct.S* %p0)		call void @f(%struct.S* %p0)
ret void		ret void
}		}

define void @cfg(i1 %x, i1 %y) {		define void @cfg(i1 %x, i1 %y) {
; Test that the blocks are analyzed in the correct order.		; Test that the blocks are analyzed in the correct order.
; CHECK-LABEL: cfg:		; CHECK-LABEL: cfg:
entry:		entry:
br i1 %x, label %bb1, label %bb2		br i1 %x, label %bb1, label %bb3

bb1:		bb1:
%p1 = alloca %struct.S		%p1 = alloca %struct.S
; CHECK: pushl %eax		; CHECK: pushl %eax
; CHECK: subl $1020, %esp		; CHECK: subl $1020, %esp
br label %bb3		br label %bb4

bb2:		bb2:
%p2 = alloca %struct.T		%p5 = alloca %struct.T
; CHECK: pushl %eax		; CHECK: pushl %eax
; CHECK: subl $2996, %esp		; CHECK: subl $2996, %esp
br label %bb3		call void @g(%struct.T* %p5)
		ret void

bb3:		bb3:
br i1 %y, label %bb4, label %bb5		%p2 = alloca %struct.T
		; CHECK: pushl %eax
		; CHECK: subl $2996, %esp
		br label %bb4

bb4:		bb4:
		br i1 %y, label %bb5, label %bb2

		bb5:
%p4 = alloca %struct.S		%p4 = alloca %struct.S
; CHECK: subl $1024, %esp		; CHECK: subl $1024, %esp
call void @f(%struct.S* %p4)		call void @f(%struct.S* %p4)
ret void		ret void

bb5:
%p5 = alloca %struct.T
; CHECK: pushl %eax
; CHECK: subl $2996, %esp
call void @g(%struct.T* %p5)
ret void
}		}


declare void @f(%struct.S*)		declare void @f(%struct.S*)
declare void @g(%struct.T*)		declare void @g(%struct.T*)
declare void @h(%struct.U*)		declare void @h(%struct.U*)

declare i8* @llvm.stacksave()		declare i8* @llvm.stacksave()
declare void @llvm.stackrestore(i8*)		declare void @llvm.stackrestore(i8*)

This is an archive of the discontinued LLVM Phabricator instance.

Codegen: Make chains from trellis-shaped CFGs
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 88586

llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp

llvm/trunk/test/CodeGen/AArch64/branch-relax-cbz.ll

llvm/trunk/test/CodeGen/AArch64/combine-comparisons-by-cse.ll

llvm/trunk/test/CodeGen/AArch64/optimize-cond-branch.ll

llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll

llvm/trunk/test/CodeGen/AMDGPU/branch-relaxation.ll

llvm/trunk/test/CodeGen/AMDGPU/cf-loop-on-constant.ll

llvm/trunk/test/CodeGen/AMDGPU/convergent-inlineasm.ll

llvm/trunk/test/CodeGen/AMDGPU/salu-to-valu.ll

llvm/trunk/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll

llvm/trunk/test/CodeGen/ARM/atomic-cmpxchg.ll

llvm/trunk/test/CodeGen/ARM/fold-stack-adjust.ll

llvm/trunk/test/CodeGen/PowerPC/tail-dup-break-cfg.ll

llvm/trunk/test/CodeGen/PowerPC/tail-dup-layout.ll

llvm/trunk/test/CodeGen/SPARC/sjlj.ll

llvm/trunk/test/CodeGen/WebAssembly/mem-intrinsics.ll

llvm/trunk/test/CodeGen/X86/block-placement.ll

llvm/trunk/test/CodeGen/X86/bypass-slow-division-32.ll

llvm/trunk/test/CodeGen/X86/sse1.ll

llvm/trunk/test/CodeGen/X86/tail-dup-merge-loop-headers.ll

llvm/trunk/test/CodeGen/X86/tail-dup-repeat.ll

llvm/trunk/test/CodeGen/X86/tail-opts.ll

llvm/trunk/test/CodeGen/X86/twoaddr-coalesce-3.ll

llvm/trunk/test/CodeGen/X86/win-alloca-expander.ll

Codegen: Make chains from trellis-shaped CFGs
ClosedPublic