Skip to content

Commit 551a57f

Browse files
author
Cong Hou
committedJan 26, 2016
Allow X86::COND_NE_OR_P and X86::COND_NP_OR_E to be reversed.
Currently, AnalyzeBranch() fails non-equality comparison between floating points on X86 (see https://llvm.org/bugs/show_bug.cgi?id=23875). This is because this function can modify the branch by reversing the conditional jump and removing unconditional jump if there is a proper fall-through. However, in the case of non-equality comparison between floating points, this can turn the branch "unanalyzable". Consider the following case: jne.BB1 jp.BB1 jmp.BB2 .BB1: ... .BB2: ... AnalyzeBranch() will reverse "jp .BB1" to "jnp .BB2" and then "jmp .BB2" will be removed: jne.BB1 jnp.BB2 .BB1: ... .BB2: ... However, AnalyzeBranch() cannot analyze this branch anymore as there are two conditional jumps with different targets. This may disable some optimizations like block-placement: in this case the fall-through behavior is enforced even if the fall-through block is very cold, which is suboptimal. Actually this optimization is also done in block-placement pass, which means we can remove this optimization from AnalyzeBranch(). However, currently X86::COND_NE_OR_P and X86::COND_NP_OR_E are not reversible: there is no defined negation conditions for them. In order to reverse them, this patch defines two new CondCode X86::COND_E_AND_NP and X86::COND_P_AND_NE. It also defines how to synthesize instructions for them. Here only the second conditional jump is reversed. This is valid as we only need them to do this "unconditional jump removal" optimization. Differential Revision: http://reviews.llvm.org/D11393 llvm-svn: 258847
1 parent 7423715 commit 551a57f

File tree

6 files changed

+161
-70
lines changed

6 files changed

+161
-70
lines changed
 

Diff for: ‎llvm/lib/Target/X86/X86InstrInfo.cpp

+75-19
Original file line numberDiff line numberDiff line change
@@ -3805,6 +3805,10 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
38053805
case X86::COND_NP: return X86::COND_P;
38063806
case X86::COND_O: return X86::COND_NO;
38073807
case X86::COND_NO: return X86::COND_O;
3808+
case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
3809+
case X86::COND_NP_OR_E: return X86::COND_P_AND_NE;
3810+
case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
3811+
case X86::COND_P_AND_NE: return X86::COND_NP_OR_E;
38083812
}
38093813
}
38103814

@@ -3998,9 +4002,9 @@ bool X86InstrInfo::AnalyzeBranchImpl(
39984002
MachineBasicBlock::iterator OldInst = I;
39994003

40004004
BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
4001-
.addMBB(UnCondBrIter->getOperand(0).getMBB());
4005+
.addMBB(UnCondBrIter->getOperand(0).getMBB());
40024006
BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
4003-
.addMBB(TargetBB);
4007+
.addMBB(TargetBB);
40044008

40054009
OldInst->eraseFromParent();
40064010
UnCondBrIter->eraseFromParent();
@@ -4024,11 +4028,6 @@ bool X86InstrInfo::AnalyzeBranchImpl(
40244028
assert(Cond.size() == 1);
40254029
assert(TBB);
40264030

4027-
// Only handle the case where all conditional branches branch to the same
4028-
// destination.
4029-
if (TBB != I->getOperand(0).getMBB())
4030-
return true;
4031-
40324031
// If the conditions are the same, we can leave them alone.
40334032
X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
40344033
if (OldBranchCode == BranchCode)
@@ -4037,17 +4036,40 @@ bool X86InstrInfo::AnalyzeBranchImpl(
40374036
// If they differ, see if they fit one of the known patterns. Theoretically,
40384037
// we could handle more patterns here, but we shouldn't expect to see them
40394038
// if instruction selection has done a reasonable job.
4040-
if ((OldBranchCode == X86::COND_NP &&
4041-
BranchCode == X86::COND_E) ||
4042-
(OldBranchCode == X86::COND_E &&
4043-
BranchCode == X86::COND_NP))
4039+
auto NewTBB = I->getOperand(0).getMBB();
4040+
if (TBB == NewTBB &&
4041+
((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_E) ||
4042+
(OldBranchCode == X86::COND_E && BranchCode == X86::COND_NP))) {
40444043
BranchCode = X86::COND_NP_OR_E;
4045-
else if ((OldBranchCode == X86::COND_P &&
4046-
BranchCode == X86::COND_NE) ||
4047-
(OldBranchCode == X86::COND_NE &&
4048-
BranchCode == X86::COND_P))
4044+
} else if (TBB == NewTBB &&
4045+
((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
4046+
(OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
40494047
BranchCode = X86::COND_NE_OR_P;
4050-
else
4048+
} else if ((OldBranchCode == X86::COND_NE && BranchCode == X86::COND_NP) ||
4049+
(OldBranchCode == X86::COND_P && BranchCode == X86::COND_E)) {
4050+
// X86::COND_P_AND_NE usually has two different branch destinations.
4051+
//
4052+
// JNP B1
4053+
// JNE B2
4054+
// B1: (fall-through)
4055+
// B2:
4056+
//
4057+
// Here this condition branches to B2 only if P && NE. It has another
4058+
// equivalent form:
4059+
//
4060+
// JE B1
4061+
// JP B2
4062+
// B1: (fall-through)
4063+
// B2:
4064+
//
4065+
// Similarly it branches to B2 only if NE && P. That is why this condition
4066+
// is named COND_P_AND_NE.
4067+
BranchCode = X86::COND_P_AND_NE;
4068+
} else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
4069+
(OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
4070+
// See comments above for X86::COND_P_AND_NE.
4071+
BranchCode = X86::COND_E_AND_NP;
4072+
} else
40514073
return true;
40524074

40534075
// Update the MachineOperand.
@@ -4156,6 +4178,13 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
41564178
return Count;
41574179
}
41584180

4181+
static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB) {
4182+
auto I = std::next(MBB->getIterator());
4183+
if (I == MBB->getParent()->end())
4184+
return nullptr;
4185+
return &*I;
4186+
}
4187+
41594188
unsigned
41604189
X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
41614190
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
@@ -4172,6 +4201,9 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
41724201
return 1;
41734202
}
41744203

4204+
// If FBB is null, it is implied to be a fall-through block.
4205+
bool FallThru = FBB == nullptr;
4206+
41754207
// Conditional branch.
41764208
unsigned Count = 0;
41774209
X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
@@ -4190,13 +4222,39 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
41904222
BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
41914223
++Count;
41924224
break;
4225+
case X86::COND_P_AND_NE:
4226+
// Use the next block of MBB as FBB if it is null.
4227+
if (FBB == nullptr) {
4228+
FBB = getFallThroughMBB(&MBB);
4229+
assert(FBB && "MBB cannot be the last block in function when the false "
4230+
"body is a fall-through.");
4231+
}
4232+
// Synthesize NEG_NP_OR_E with two branches.
4233+
BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(FBB);
4234+
++Count;
4235+
BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
4236+
++Count;
4237+
break;
4238+
case X86::COND_E_AND_NP:
4239+
// Use the next block of MBB as FBB if it is null.
4240+
if (FBB == nullptr) {
4241+
FBB = getFallThroughMBB(&MBB);
4242+
assert(FBB && "MBB cannot be the last block in function when the false "
4243+
"body is a fall-through.");
4244+
}
4245+
// Synthesize NEG_NE_OR_P with two branches.
4246+
BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB);
4247+
++Count;
4248+
BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
4249+
++Count;
4250+
break;
41934251
default: {
41944252
unsigned Opc = GetCondBranchFromCond(CC);
41954253
BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
41964254
++Count;
41974255
}
41984256
}
4199-
if (FBB) {
4257+
if (!FallThru) {
42004258
// Two-way Conditional branch. Insert the second branch.
42014259
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
42024260
++Count;
@@ -6717,8 +6775,6 @@ bool X86InstrInfo::
67176775
ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
67186776
assert(Cond.size() == 1 && "Invalid X86 branch condition!");
67196777
X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
6720-
if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
6721-
return true;
67226778
Cond[0].setImm(GetOppositeBranchCondition(CC));
67236779
return false;
67246780
}

Diff for: ‎llvm/lib/Target/X86/X86InstrInfo.h

+38-29
Original file line numberDiff line numberDiff line change
@@ -29,35 +29,44 @@ namespace llvm {
2929
namespace X86 {
3030
// X86 specific condition code. These correspond to X86_*_COND in
3131
// X86InstrInfo.td. They must be kept in synch.
32-
enum CondCode {
33-
COND_A = 0,
34-
COND_AE = 1,
35-
COND_B = 2,
36-
COND_BE = 3,
37-
COND_E = 4,
38-
COND_G = 5,
39-
COND_GE = 6,
40-
COND_L = 7,
41-
COND_LE = 8,
42-
COND_NE = 9,
43-
COND_NO = 10,
44-
COND_NP = 11,
45-
COND_NS = 12,
46-
COND_O = 13,
47-
COND_P = 14,
48-
COND_S = 15,
49-
LAST_VALID_COND = COND_S,
50-
51-
// Artificial condition codes. These are used by AnalyzeBranch
52-
// to indicate a block terminated with two conditional branches to
53-
// the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
54-
// which can't be represented on x86 with a single condition. These
55-
// are never used in MachineInstrs.
56-
COND_NE_OR_P,
57-
COND_NP_OR_E,
58-
59-
COND_INVALID
60-
};
32+
enum CondCode {
33+
COND_A = 0,
34+
COND_AE = 1,
35+
COND_B = 2,
36+
COND_BE = 3,
37+
COND_E = 4,
38+
COND_G = 5,
39+
COND_GE = 6,
40+
COND_L = 7,
41+
COND_LE = 8,
42+
COND_NE = 9,
43+
COND_NO = 10,
44+
COND_NP = 11,
45+
COND_NS = 12,
46+
COND_O = 13,
47+
COND_P = 14,
48+
COND_S = 15,
49+
LAST_VALID_COND = COND_S,
50+
51+
// Artificial condition codes. These are used by AnalyzeBranch
52+
// to indicate a block terminated with two conditional branches to
53+
// the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
54+
// which can't be represented on x86 with a single condition. These
55+
// are never used in MachineInstrs.
56+
COND_NE_OR_P,
57+
COND_NP_OR_E,
58+
59+
// Artificial condition codes. These are used to represent the negation of
60+
// above two conditions. The only scenario we need these two conditions is
61+
// when we try to reverse above two conditions in order to remove redundant
62+
// unconditional jumps. Note that both true and false bodies need to be
63+
// avaiable in order to correctly synthesize instructions for them. These are
64+
// never used in MachineInstrs.
65+
COND_E_AND_NP, // negate of COND_NE_OR_P
66+
COND_P_AND_NE, // negate of COND_NP_OR_E
67+
68+
COND_INVALID
69+
};
6170

6271
// Turn condition code into conditional branch opcode.
6372
unsigned GetCondBranchFromCond(CondCode CC);

Diff for: ‎llvm/test/CodeGen/X86/block-placement.ll

+14-13
Original file line numberDiff line numberDiff line change
@@ -463,26 +463,23 @@ exit:
463463
}
464464

465465
define void @fpcmp_unanalyzable_branch(i1 %cond) {
466-
; This function's CFG contains an unanalyzable branch that is likely to be
467-
; split due to having a different high-probability predecessor.
466+
; This function's CFG contains an once-unanalyzable branch (une on floating
467+
; points). As now it becomes analyzable, we should get best layout in which each
468+
; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is
469+
; fall-through.
468470
; CHECK: fpcmp_unanalyzable_branch
469471
; CHECK: %entry
472+
; CHECK: %entry.if.then_crit_edge
473+
; CHECK: %if.then
474+
; CHECK: %if.end
470475
; CHECK: %exit
471-
; CHECK-NOT: %if.then
472-
; CHECK-NOT: %if.end
473-
; CHECK-NOT: jne
474-
; CHECK-NOT: jnp
475476
; CHECK: jne
476477
; CHECK-NEXT: jnp
477-
; CHECK-NEXT: %if.then
478478

479479
entry:
480480
; Note that this branch must be strongly biased toward
481481
; 'entry.if.then_crit_edge' to ensure that we would try to form a chain for
482-
; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then'. It is the last edge in that
483-
; chain which would violate the unanalyzable branch in 'exit', but we won't even
484-
; try this trick unless 'if.then' is believed to almost always be reached from
485-
; 'entry.if.then_crit_edge'.
482+
; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end'.
486483
br i1 %cond, label %entry.if.then_crit_edge, label %lor.lhs.false, !prof !1
487484

488485
entry.if.then_crit_edge:
@@ -494,7 +491,7 @@ lor.lhs.false:
494491

495492
exit:
496493
%cmp.i = fcmp une double 0.000000e+00, undef
497-
br i1 %cmp.i, label %if.then, label %if.end
494+
br i1 %cmp.i, label %if.then, label %if.end, !prof !3
498495

499496
if.then:
500497
%0 = phi i8 [ %.pre14, %entry.if.then_crit_edge ], [ undef, %exit ]
@@ -507,6 +504,7 @@ if.end:
507504
}
508505

509506
!1 = !{!"branch_weights", i32 1000, i32 1}
507+
!3 = !{!"branch_weights", i32 1, i32 1000}
510508

511509
declare i32 @f()
512510
declare i32 @g()
@@ -665,11 +663,14 @@ define void @unanalyzable_branch_to_best_succ(i1 %cond) {
665663
; Ensure that we can handle unanalyzable branches where the destination block
666664
; gets selected as the optimal successor to merge.
667665
;
666+
; This branch is now analyzable and hence the destination block becomes the
667+
; hotter one. The right order is entry->bar->exit->foo.
668+
;
668669
; CHECK: unanalyzable_branch_to_best_succ
669670
; CHECK: %entry
670-
; CHECK: %foo
671671
; CHECK: %bar
672672
; CHECK: %exit
673+
; CHECK: %foo
673674

674675
entry:
675676
; Bias this branch toward bar to ensure we form that chain.

Diff for: ‎llvm/test/CodeGen/X86/fast-isel-cmp-branch2.ll

+2-3
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ define i32 @fcmp_oeq(float %x, float %y) {
55
; CHECK-LABEL: fcmp_oeq
66
; CHECK: ucomiss %xmm1, %xmm0
77
; CHECK-NEXT: jne {{LBB.+_1}}
8-
; CHECK-NEXT: jnp {{LBB.+_2}}
8+
; CHECK-NEXT: jp {{LBB.+_1}}
99
%1 = fcmp oeq float %x, %y
1010
br i1 %1, label %bb1, label %bb2
1111
bb2:
@@ -162,8 +162,7 @@ define i32 @fcmp_une(float %x, float %y) {
162162
; CHECK-LABEL: fcmp_une
163163
; CHECK: ucomiss %xmm1, %xmm0
164164
; CHECK-NEXT: jne {{LBB.+_2}}
165-
; CHECK-NEXT: jp {{LBB.+_2}}
166-
; CHECK-NEXT: jmp {{LBB.+_1}}
165+
; CHECK-NEXT: jnp {{LBB.+_1}}
167166
%1 = fcmp une float %x, %y
168167
br i1 %1, label %bb1, label %bb2
169168
bb2:

Diff for: ‎llvm/test/CodeGen/X86/fast-isel-cmp-branch3.ll

+2-3
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ define i32 @fcmp_oeq2(float %x) {
1717
; CHECK: xorps %xmm1, %xmm1
1818
; CHECK-NEXT: ucomiss %xmm1, %xmm0
1919
; CHECK-NEXT: jne {{LBB.+_1}}
20-
; CHECK-NEXT: jnp {{LBB.+_2}}
20+
; CHECK-NEXT: jp {{LBB.+_1}}
2121
%1 = fcmp oeq float %x, 0.000000e+00
2222
br i1 %1, label %bb1, label %bb2
2323
bb2:
@@ -338,8 +338,7 @@ define i32 @fcmp_une2(float %x) {
338338
; CHECK: xorps %xmm1, %xmm1
339339
; CHECK-NEXT: ucomiss %xmm1, %xmm0
340340
; CHECK-NEXT: jne {{LBB.+_2}}
341-
; CHECK-NEXT: jp {{LBB.+_2}}
342-
; CHECK-NEXT: jmp {{LBB.+_1}}
341+
; CHECK-NEXT: jnp {{LBB.+_1}}
343342
%1 = fcmp une float %x, 0.000000e+00
344343
br i1 %1, label %bb1, label %bb2
345344
bb2:

Diff for: ‎llvm/test/CodeGen/X86/fp-une-cmp.ll

+30-3
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@
1919
; addsd ...
2020
; LBB0_2:
2121

22-
; CHECK: func
22+
define float @func1(float %x, float %y) nounwind readnone optsize ssp {
23+
; CHECK: func1
2324
; CHECK: jne [[LABEL:.*]]
2425
; CHECK-NEXT: jp [[LABEL]]
2526
; CHECK-NOT: jmp
26-
27-
define float @func(float %x, float %y) nounwind readnone optsize ssp {
27+
;
2828
entry:
2929
%0 = fpext float %x to double
3030
%1 = fpext float %y to double
@@ -41,3 +41,30 @@ bb2:
4141
%.0 = fptrunc double %.0.in to float
4242
ret float %.0
4343
}
44+
45+
define float @func2(float %x, float %y) nounwind readnone optsize ssp {
46+
; CHECK: func2
47+
; CHECK: jne [[LABEL:.*]]
48+
; CHECK-NEXT: jp [[LABEL]]
49+
; CHECK: %bb2
50+
; CHECK: %bb1
51+
; CHECK: jmp
52+
;
53+
entry:
54+
%0 = fpext float %x to double
55+
%1 = fpext float %y to double
56+
%2 = fmul double %0, %1
57+
%3 = fcmp une double %2, 0.000000e+00
58+
br i1 %3, label %bb1, label %bb2, !prof !1
59+
60+
bb1:
61+
%4 = fadd double %2, -1.000000e+00
62+
br label %bb2
63+
64+
bb2:
65+
%.0.in = phi double [ %4, %bb1 ], [ %2, %entry ]
66+
%.0 = fptrunc double %.0.in to float
67+
ret float %.0
68+
}
69+
70+
!1 = !{!"branch_weights", i32 1, i32 1000}

0 commit comments

Comments
 (0)
Please sign in to comment.