Index: llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
+++ llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
@@ -72,6 +72,11 @@
                        cl::desc("Minimum gain per loop (in cycles) threshold."),
                        cl::init(4), cl::Hidden);
 
+static cl::opt<bool> ForceMemOperand(
+    "x86-cmov-converter-force-mem-operand",
+    cl::desc("Convert cmovs to branches whenever they have memory operands."),
+    cl::init(true), cl::Hidden);
+
 /// Converts X86 cmov instructions into branches when profitable.
 class X86CmovConverterPass : public MachineFunctionPass {
 public:
@@ -86,8 +91,9 @@
   /// Pass identification, replacement for typeid.
   static char ID;
 
-  const MachineRegisterInfo *MRI;
+  MachineRegisterInfo *MRI;
   const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
   TargetSchedModel TSchedModel;
 
   /// List of consecutive CMOV instructions.
@@ -101,7 +107,8 @@
   /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
   /// \returns true iff it found any CMOV-group-candidate.
   bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
-                             CmovGroups &CmovInstGroups);
+                             CmovGroups &CmovInstGroups,
+                             bool IncludeLoads = false);
 
   /// Check if it is profitable to transform each CMOV-group-candidates into
   /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
@@ -139,10 +146,36 @@
   const TargetSubtargetInfo &STI = MF.getSubtarget();
   MRI = &MF.getRegInfo();
   TII = STI.getInstrInfo();
+  TRI = STI.getRegisterInfo();
   TSchedModel.init(STI.getSchedModel(), &STI, TII);
 
+  // Before we handle the more subtle cases of register-register CMOVs inside
+  // of potentially hot loops, we want to quickly remove all CMOVs with
+  // a memory operand. The CMOV will risk a stall waiting for the load to
+  // complete that speculative execution behind a branch is better suited to
+  // handle on modern x86 chips.
+  if (ForceMemOperand) {
+    CmovGroups AllCmovGroups;
+    SmallVector<MachineBasicBlock *, 4> Blocks;
+    for (auto &MBB : MF)
+      Blocks.push_back(&MBB);
+    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
+      for (auto &Group : AllCmovGroups) {
+        // Skip any group that doesn't do at least one memory operand cmov.
+        if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
+          continue;
+
+        // For CMOV groups which we can rewrite and which contain a memory load,
+        // always rewrite them. On x86, a CMOV will dramatically amplify any
+        // memory latency by blocking speculative execution.
+        Changed = true;
+        convertCmovInstsToBranches(Group);
+      }
+    }
+  }
+
   //===--------------------------------------------------------------------===//
-  // Algorithm
+  // Register-operand Conversion Algorithm
   // ---------
   // For each inner most loop
   //   collectCmovCandidates() {
@@ -191,11 +224,13 @@
     for (auto &Group : CmovInstGroups)
       convertCmovInstsToBranches(Group);
   }
+
   return Changed;
 }
 
 bool X86CmovConverterPass::collectCmovCandidates(
-    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
+    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
+    bool IncludeLoads) {
   //===--------------------------------------------------------------------===//
   // Collect all CMOV-group-candidates and add them into CmovInstGroups.
   //
@@ -221,7 +256,7 @@
     Group.clear();
     // Condition code of first CMOV instruction current processed range and its
     // opposite condition code.
-    X86::CondCode FirstCC, FirstOppCC;
+    X86::CondCode FirstCC, FirstOppCC, MemOpCC;
     // Indicator of a non CMOVrr instruction in the current processed range.
     bool FoundNonCMOVInst = false;
     // Indicator for current processed CMOV-group if it should be skipped.
@@ -230,11 +265,13 @@
     for (auto &I : *MBB) {
       X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
       // Check if we found a X86::CMOVrr instruction.
-      if (CC != X86::COND_INVALID && !I.mayLoad()) {
+      if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
         if (Group.empty()) {
           // We found first CMOV in the range, reset flags.
           FirstCC = CC;
           FirstOppCC = X86::GetOppositeBranchCondition(CC);
+          // Clear out the prior group's memory operand CC.
+          MemOpCC = X86::COND_INVALID;
           FoundNonCMOVInst = false;
           SkipGroup = false;
         }
@@ -244,6 +281,14 @@
         if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
           // Mark the SKipGroup indicator to skip current processed CMOV-Group.
           SkipGroup = true;
+        if (I.mayLoad()) {
+          if (MemOpCC == X86::COND_INVALID)
+            // The first memory operand CMOV.
+            MemOpCC = CC;
+          else if (CC != MemOpCC)
+            // Can't handle mixed conditions with memory operands.
+            SkipGroup = true;
+        }
         continue;
       }
       // If Group is empty, keep looking for first CMOV in the range.
@@ -540,8 +585,18 @@
   MachineInstr &MI = *Group.front();
   MachineInstr *LastCMOV = Group.back();
   DebugLoc DL = MI.getDebugLoc();
+
   X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+  // Potentially swap the condition codes so that any memory operand to a CMOV
+  // is in the *false* position instead of the *true* position. We can invert
+  // any non-memory operand CMOV instructions to cope with this and we ensure
+  // memory operand CMOVs are only included with a single condition code.
+  if (llvm::any_of(Group, [&](MachineInstr *I) {
+        return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC;
+      }))
+    std::swap(CC, OppCC);
+
   MachineBasicBlock *MBB = MI.getParent();
   MachineFunction::iterator It = ++MBB->getIterator();
   MachineFunction *F = MBB->getParent();
@@ -578,7 +633,103 @@
   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
   MachineBasicBlock::iterator MIItEnd =
       std::next(MachineBasicBlock::iterator(LastCMOV));
+  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
   MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+
+  // First we need to insert an explicit load on the false path for any memory
+  // operand. We also need to potentially do register rewriting here, but it is
+  // simpler as the memory operands are always on the false path so we can
+  // simply take that input, whatever it is.
+  DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
+  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
+    auto &MI = *MIIt++;
+    // Skip any CMOVs in this group which don't load from memory.
+    if (!MI.mayLoad()) {
+      // Remember the false-side register input.
+      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] =
+          MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2)
+              .getReg();
+      continue;
+    }
+
+    // The condition must be the *opposite* of the one we've decided to branch
+    // on as the branch will go *around* the load and the load should happen
+    // when the CMOV condition is false.
+    assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC &&
+           "Can only handle memory-operand cmov instructions with a condition "
+           "opposite to the selected branch direction.");
+
+    // The goal is to rewrite the cmov from:
+    //
+    //   MBB:
+    //     %A = CMOVcc %B (tied), (mem)
+    //
+    // to
+    //
+    //   MBB:
+    //     %A = CMOVcc %B (tied), %C
+    //   FalseMBB:
+    //     %C = MOV (mem)
+    //
+    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
+    //
+    //   MBB:
+    //     JMP!cc SinkMBB
+    //   FalseMBB:
+    //     %C = MOV (mem)
+    //   SinkMBB:
+    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]
+
+    // Get a fresh register to use as the destination of the MOV.
+    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
+    unsigned TmpReg = MRI->createVirtualRegister(RC);
+
+    SmallVector<MachineInstr *, 4> NewMIs;
+    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
+                                             /*UnfoldLoad*/ true,
+                                             /*UnfoldStore*/ false, NewMIs);
+    (void)Unfolded;
+    assert(Unfolded && "Should never fail to unfold a loading cmov!");
+
+    // Move the new CMOV to just before the old one and reset any impacted
+    // iterator.
+    auto *NewCMOV = NewMIs.pop_back_val();
+    assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
+           "Last new instruction isn't the expected CMOV!");
+    DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
+    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
+    if (&*MIItBegin == &MI)
+      MIItBegin = MachineBasicBlock::iterator(NewCMOV);
+
+    // Sink whatever instructions were needed to produce the unfolded operand
+    // into the false block.
+    for (auto *NewMI : NewMIs) {
+      DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
+      FalseMBB->insert(FalseInsertionPoint, NewMI);
+      // Re-map any operands that are from other cmovs to the inputs for this block.
+      for (auto &MOp : NewMI->uses()) {
+        if (!MOp.isReg())
+          continue;
+        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
+        if (It == FalseBBRegRewriteTable.end())
+          continue;
+
+        MOp.setReg(It->second);
+        // This might have been a kill when it referenced the cmov result, but
+        // it won't necessarily be once rewritten.
+        // FIXME: We could potentially improve this by tracking whether the
+        // operand to the cmov was also a kill, and then skipping the PHI node
+        // construction below.
+        MOp.setIsKill(false);
+      }
+    }
+    MBB->erase(MachineBasicBlock::iterator(MI),
+               std::next(MachineBasicBlock::iterator(MI)));
+
+    // Add this PHI to the rewrite table.
+    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
+  }
+
   // As we are creating the PHIs, we have to be careful if there is more than
   // one. Later CMOVs may reference the results of earlier CMOVs, but later
   // PHIs have to reference the individual true/false inputs from earlier PHIs.
Index: llvm/trunk/test/CodeGen/X86/cmov.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/cmov.ll
+++ llvm/trunk/test/CodeGen/X86/cmov.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch -x86-cmov-converter=false | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
 define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
Index: llvm/trunk/test/CodeGen/X86/pr15981.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pr15981.ll
+++ llvm/trunk/test/CodeGen/X86/pr15981.ll
@@ -44,7 +44,10 @@
 ; X64:       # BB#0:
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    decl {{.*}}(%rip)
-; X64-NEXT:    cmovnel {{.*}}(%rip), %eax
+; X64-NEXT:    je .LBB1_2
+; X64-NEXT:  # BB#1:
+; X64-NEXT:    movl {{.*}}(%rip), %eax
+; X64-NEXT:  .LBB1_2:
 ; X64-NEXT:    movl %eax, {{.*}}(%rip)
 ; X64-NEXT:    retq
   %1 = load volatile i32, i32* @b, align 4
Index: llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
+++ llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
@@ -330,4 +330,141 @@
   ret void
 }
 
+; Test that we always will convert a cmov with a memory operand into a branch,
+; even outside of a loop.
+define i32 @test_cmov_memoperand(i32 %a, i32 %b, i32 %x, i32* %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %load = load i32, i32* %y
+  %z = select i1 %cond, i32 %x, i32 %load
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movl (%r{{..}}), %[[R:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R]], %
+  ret i32 %z
+}
+
+; Test that we can convert a group of cmovs where only one has a memory
+; operand.
+define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %y = load i32, i32* %y.ptr
+  %z1 = select i1 %cond, i32 %x, i32 %a
+  %z2 = select i1 %cond, i32 %x, i32 %y
+  %z3 = select i1 %cond, i32 %x, i32 %b
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
+; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
+; CHECK-DAG: movl %{{.*}} %[[R3:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: addl
+; CHECK-DAG: %[[R1]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK-DAG: addl
+; CHECK-DAG: %[[R2]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK: movl %[[R3]], %eax
+; CHECK: retq
+  %s1 = add i32 %z1, %z2
+  %s2 = add i32 %s1, %z3
+  ret i32 %s2
+}
+
+; Same as before but with operands reversed in the select with a load.
+define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group2:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %y = load i32, i32* %y.ptr
+  %z2 = select i1 %cond, i32 %a, i32 %x
+  %z1 = select i1 %cond, i32 %y, i32 %x
+  %z3 = select i1 %cond, i32 %b, i32 %x
+; CHECK-NOT: cmov
+; CHECK: jbe [[FALSE_BB:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
+; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
+; CHECK-DAG: movl %{{.*}} %[[R3:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: addl
+; CHECK-DAG: %[[R1]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK-DAG: addl
+; CHECK-DAG: %[[R2]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK: movl %[[R3]], %eax
+; CHECK: retq
+  %s1 = add i32 %z1, %z2
+  %s2 = add i32 %s1, %z3
+  ret i32 %s2
+}
+
+; Test that we don't convert a group of cmovs with conflicting directions of
+; loads.
+define i32 @test_cmov_memoperand_conflicting_dir(i32 %a, i32 %b, i32 %x, i32* %y1.ptr, i32* %y2.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_conflicting_dir:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %y1 = load i32, i32* %y1.ptr
+  %y2 = load i32, i32* %y2.ptr
+  %z1 = select i1 %cond, i32 %x, i32 %y1
+  %z2 = select i1 %cond, i32 %y2, i32 %x
+; CHECK: cmoval
+; CHECK: cmoval
+  %s1 = add i32 %z1, %z2
+  ret i32 %s1
+}
+
+; Test that we can convert a group of cmovs where only one has a memory
+; operand and where that memory operand's registers come from a prior cmov in the group.
+define i32 @test_cmov_memoperand_in_group_reuse_for_addr(i32 %a, i32 %b, i32* %x, i32* %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %p = select i1 %cond, i32* %x, i32* %y
+  %load = load i32, i32* %p
+  %z = select i1 %cond, i32 %a, i32 %load
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movl (%r{{..}}), %[[R:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R]], %eax
+; CHECK: retq
+  ret i32 %z
+}
+
+; Test that we can convert a group of two cmovs with memory operands where one
+; uses the result of the other as part of the address.
+define i32 @test_cmov_memoperand_in_group_reuse_for_addr2(i32 %a, i32 %b, i32* %x, i32** %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr2:
+entry:
+  %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+  %load1 = load i32*, i32** %y
+  %p = select i1 %cond, i32* %x, i32* %load1
+  %load2 = load i32, i32* %p
+  %z = select i1 %cond, i32 %a, i32 %load2
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movq (%r{{..}}), %[[R1:.*]]
+; CHECK: movl (%[[R1]]), %[[R2:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R2]], %eax
+; CHECK: retq
+  ret i32 %z
+}
+
 attributes #0 = {"target-cpu"="x86-64"}
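
Usage note (editor's addition, not part of the patch): the memory-operand conversion is gated by the new x86-cmov-converter-force-mem-operand option and defaults to on (cl::init(true)). A sketch of how to turn off just this behavior when comparing codegen, where "example.ll" is only a placeholder input file:

    llc -mtriple=x86_64-unknown-unknown -x86-cmov-converter-force-mem-operand=false example.ll -o -

Passing -x86-cmov-converter=false instead, as the updated cmov.ll RUN line above does, disables the whole cmov-to-branch pass, including this new memory-operand handling.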