
[x86] Teach the cmov converter to aggressively convert cmovs with memory operands into control flow.
ClosedPublic

Authored by chandlerc on Aug 17 2017, 7:02 PM.

Details

Summary

We have periodically seen performance problems with cmov when one
operand comes from memory. On modern x86 processors with strong branch
predictors and speculative execution, such cases tend to be much better
handled with a branch than with a cmov. We routinely see the cmov stall
while the load completes rather than continuing, and if there are
subsequent branches, they cannot be speculated in turn.

Also, in many (even simple) cases, macro fusion causes the control flow
version to be fewer uops.

Consider the IACA output for the initial sequence of code in a very hot
function in one of our internal benchmarks that motivates this, and notice the
micro-op reduction provided.
Before, SNB:

Throughput Analysis Report
--------------------------
Block Throughput: 2.20 Cycles       Throughput Bottleneck: Port1

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    |           | 1.0 |           |           |     |     | CP | mov rcx, rdi
|   0*   |           |     |           |           |     |     |    | xor edi, edi
|   2^   | 0.1       | 0.6 | 0.5   0.5 | 0.5   0.5 |     | 0.4 | CP | cmp byte ptr [rsi+0xf], 0xf
|   1    |           |     | 0.5   0.5 | 0.5   0.5 |     |     |    | mov rax, qword ptr [rsi]
|   3    | 1.8       | 0.6 |           |           |     | 0.6 | CP | cmovbe rax, rdi
|   2^   |           |     | 0.5   0.5 | 0.5   0.5 |     | 1.0 |    | cmp byte ptr [rcx+0xf], 0x10
|   0F   |           |     |           |           |     |     |    | jb 0xf
Total Num Of Uops: 9

After, SNB:

Throughput Analysis Report
--------------------------
Block Throughput: 2.00 Cycles       Throughput Bottleneck: Port5

| Num Of |              Ports pressure in cycles               |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |    |
---------------------------------------------------------------------
|   1    | 0.5       | 0.5 |           |           |     |     |    | mov rax, rdi
|   0*   |           |     |           |           |     |     |    | xor edi, edi
|   2^   | 0.5       | 0.5 | 1.0   1.0 |           |     |     |    | cmp byte ptr [rsi+0xf], 0xf
|   1    | 0.5       | 0.5 |           |           |     |     |    | mov ecx, 0x0
|   1    |           |     |           |           |     | 1.0 | CP | jnbe 0x39
|   2^   |           |     |           | 1.0   1.0 |     | 1.0 | CP | cmp byte ptr [rax+0xf], 0x10
|   0F   |           |     |           |           |     |     |    | jnb 0x3c
Total Num Of Uops: 7

The difference even manifests in a throughput cycle rate difference on Haswell.
Before, HSW:

Throughput Analysis Report
--------------------------
Block Throughput: 2.00 Cycles       Throughput Bottleneck: FrontEnd

| Num Of |                    Ports pressure in cycles                     |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |  6  |  7  |    |
---------------------------------------------------------------------------------
|   0*   |           |     |           |           |     |     |     |     |    | mov rcx, rdi
|   0*   |           |     |           |           |     |     |     |     |    | xor edi, edi
|   2^   |           |     | 0.5   0.5 | 0.5   0.5 |     | 1.0 |     |     |    | cmp byte ptr [rsi+0xf], 0xf
|   1    |           |     | 0.5   0.5 | 0.5   0.5 |     |     |     |     |    | mov rax, qword ptr [rsi]
|   3    | 1.0       | 1.0 |           |           |     |     | 1.0 |     |    | cmovbe rax, rdi
|   2^   | 0.5       |     | 0.5   0.5 | 0.5   0.5 |     |     | 0.5 |     |    | cmp byte ptr [rcx+0xf], 0x10
|   0F   |           |     |           |           |     |     |     |     |    | jb 0xf
Total Num Of Uops: 8

After, HSW:

Throughput Analysis Report
--------------------------
Block Throughput: 1.50 Cycles       Throughput Bottleneck: FrontEnd

| Num Of |                    Ports pressure in cycles                     |    |
|  Uops  |  0  - DV  |  1  |  2  -  D  |  3  -  D  |  4  |  5  |  6  |  7  |    |
---------------------------------------------------------------------------------
|   0*   |           |     |           |           |     |     |     |     |    | mov rax, rdi
|   0*   |           |     |           |           |     |     |     |     |    | xor edi, edi
|   2^   |           |     | 1.0   1.0 |           |     | 1.0 |     |     |    | cmp byte ptr [rsi+0xf], 0xf
|   1    |           | 1.0 |           |           |     |     |     |     |    | mov ecx, 0x0
|   1    |           |     |           |           |     |     | 1.0 |     |    | jnbe 0x39
|   2^   | 1.0       |     |           | 1.0   1.0 |     |     |     |     |    | cmp byte ptr [rax+0xf], 0x10
|   0F   |           |     |           |           |     |     |     |     |    | jnb 0x3c
Total Num Of Uops: 6

Note that this cannot be usefully restricted to inner loops. Much of the
hot code we see hitting this is not in an inner loop or not in a loop at
all. The optimization still remains effective and indeed critical for
some of our code.

I have run a suite of internal benchmarks with this change and saw no
significant regressions and a few very significant improvements. I'm
still working on collecting data for SPEC and the LLVM test suite and
will update when I have it.

I am also still working on dedicated testing of this functionality, but
I've built a very large amount of code with the patch and had no issues.

Depends on D36783.

Diff Detail

Repository
rL LLVM

Event Timeline

chandlerc created this revision.Aug 17 2017, 7:02 PM

I've run this across SPEC CPU 2006 and the LLVM test suite on my Haswell system.

Only 11 programs had a different hash, only three from SPEC (433.milc, 445.gobmk, 483.xalancbmk).

Of those, only two (PENNANT and consumer-typeset) showed any significant performance change in a quick test, at 4.7% and 5.4% respectively. It is a bit hard to be 100% certain because the test suite is very noisy.

However, when re-running each under perf stat I saw no significant difference in cycle or instruction counts, and the timings swung in both directions, so this appears to be just noise.

So I think this is totally fine for the LLVM test suite and SPEC.

And our internal benchmarks (which have a bit more infrastructure for carefully measuring and minimizing noise) show no significant regressions and at least a couple of significant improvements. There is some shifting in each direction, of course, but it seems net positive or neutral across Sandybridge, Haswell, and Skylake.

I think this covers a reasonable set of performance data for checking in, so code review would be very much appreciated. If there are other benchmarks or systems others want to run, by all means. Note that there is a flag to disable this behavior as well.

aaboud edited edge metadata.Aug 18 2017, 6:01 AM

Thanks Chandler for preparing the patch. The implementation looks elegant; however, it overlooks a case where the registers used in the memory operand are themselves the result of a previous CMOV instruction.
Here is a small reproducer that results in bad MIR:

target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

define i32 @bar(i32* %a, i32* %b, i32 %n1, i32 %n2, i32 %d) #0 {
entry:
  %cmp = icmp sgt i32 %n1, %n2
  %s = select i1 %cmp, i32* %a, i32* %b
  %p = getelementptr inbounds i32, i32* %s, i64 1
  %load = load i32, i32* %p, align 4
  %res = select i1 %cmp, i32 %d, i32 %load
  
  ret i32 %res
}

attributes #0 = {"target-cpu"="skylake"}

> Thanks Chandler for preparing the patch, the implementation looks elegant, however, it overlooked a case where the memory registers are a result of a previous CMOV instructions.

Doh, of course. I'll add a remapping step when generating the unfolded load. It should be easy because we know the load is always on one side, so we can just remap to one of the inputs.

> This is a small reproducer that result in bad MIR:

Sweet!


While I'm writing the fix, and since it is already late in your TZ -- any other concerns before I land this?

> While I'm writing the fix, and since it is already late in your TZ -- any other concerns before I land this?

No concerns; the direction looks just fine.
We only need to make sure the functionality is not broken.

Also, I just got results from running our internal benchmarks: this optimization did not affect most of the workloads, and for those whose code did change, performance was unaffected as well.
So I do not see a reason not to continue with this improvement.

chandlerc updated this revision to Diff 111779.Aug 18 2017, 6:02 PM

Update with more comprehensive testing and a bug fix mentioned in code review
(as well as the suggested test and an even more exciting test case).

Ok, review comments addressed, tests added (after winning several battles with the register allocator to make interesting cmov groups).

craig.topper accepted this revision.Aug 18 2017, 6:31 PM

LGTM with the one nit.

test/CodeGen/X86/x86-cmov-converter.ll
414 ↗(On Diff #111779)

nit, space before "loads"

This revision is now accepted and ready to land.Aug 18 2017, 6:31 PM
This revision was automatically updated to reflect the committed changes.

There is still an issue with this implementation. Here is another reproducer:

target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

define i32 @bar(i32* %a, i32* %b, i32* %c, i32 %n1, i32 %n2, i32 %d) #0 {
entry:
  %cmp = icmp sgt i32 %n1, %n2
  %s1 = select i1 %cmp, i32* %a, i32* %b
  %s = select i1 %cmp, i32* %c, i32* %s1
  %p = getelementptr inbounds i32, i32* %s, i64 1
  %load = load i32, i32* %p, align 4
  %res = select i1 %cmp, i32 %d, i32 %load
  
  ret i32 %res
}

attributes #0 = {"target-cpu"="skylake"}

> There is still an issue with this implementation, here another reproducer:


Good catch, fixed in r311267.