Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -26,6 +26,16 @@
 class X86RegisterInfo;
 class X86Subtarget;
 
+namespace MachineCombinerPattern {
+enum MC_PATTERN : int {
+  MC_NONE = 0,
+  MC_REASSOC_1 = 1,
+  MC_REASSOC_2 = 2,
+  MC_REASSOC_3 = 3,
+  MC_REASSOC_4 = 4,
+};
+} // end namespace MachineCombinerPattern
+
 namespace X86 {
   // X86 specific condition code. These correspond to X86_*_COND in
   // X86InstrInfo.td. They must be kept in synch.
@@ -426,6 +436,26 @@
                               const MachineInstr *UseMI,
                               unsigned UseIdx) const override;
 
+  bool useMachineCombiner() const override {
+    return true;
+  }
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in <Root>. All potential patterns are
+  /// output in the <Pattern> array.
+  bool hasPattern(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override;
+
+  /// When hasPattern() finds a pattern, this function generates the
+  /// instructions that could replace the original code sequence.
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
   /// analyzeCompare - For a comparison instruction, return the source registers
   /// in SrcReg and SrcReg2 if having two register operands, and the value it
   /// compares against in CmpValue. Return true if the comparison instruction
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -6223,6 +6223,257 @@
   return isHighLatencyDef(DefMI->getOpcode());
 }
 
+// TODO: There is nothing x86-specific here except the instruction type.
+// This logic could be hoisted into the machine combiner pass itself.
+bool X86InstrInfo::hasPattern(MachineInstr &Root,
+        SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
+  unsigned Opc = Root.getOpcode();
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MachineFunction &MF = *(MBB->getParent());
+
+  // TODO: Ideally, we would use instruction-level fast-math-flags instead of a
+  // global parameter.
+  if (!MF.getTarget().Options.UnsafeFPMath)
+    return false;
+
+  // TODO: There are many more associative instruction types to match:
+  //   1. Other forms of scalar FP add (non-AVX)
+  //   2. Other data types (double, integer, vectors)
+  //   3. Other math / logic operations (mul, and, or)
+  unsigned AssocOpcode = X86::VADDSSrr;
+
+  if (Opc != AssocOpcode)
+    return false;
+
+  MachineOperand Op1 = Root.getOperand(1);
+  MachineOperand Op2 = Root.getOperand(2);
+
+  // We need virtual register definitions.
+  MachineInstr *MI1 = nullptr;
+  MachineInstr *MI2 = nullptr;
+  if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg()))
+    MI1 = MRI.getUniqueVRegDef(Op1.getReg());
+  if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
+    MI2 = MRI.getUniqueVRegDef(Op2.getReg());
+
+  // And they need to be in the trace (otherwise, they won't have a depth).
+  if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB)
+    return false;
+
+  bool CommuteRoot = false;
+  if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) {
+    std::swap(MI1, MI2);
+    CommuteRoot = true;
+  }
+
+  // The instruction must only be used by the other instruction that we
+  // reassociate with.
+  if (!MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
+    return false;
+
+  // We must match a simple chain of dependent ops.
+  if (MI1->getOpcode() != AssocOpcode)
+    return false;
+
+  // Avoid reassociating operands when it won't provide any benefit. If both
+  // operands are produced by instructions of this type, we may already
+  // have the optimal sequence.
+  if (MI2->getOpcode() == AssocOpcode)
+    return false;
+
+  // Look for a third dependent instruction in the sequence.
+  MachineOperand Op11 = MI1->getOperand(1);
+  MachineOperand Op12 = MI1->getOperand(2);
+
+  // We need virtual register definitions.
+  MachineInstr *MI11 = nullptr;
+  MachineInstr *MI12 = nullptr;
+  if (Op11.isReg() && TargetRegisterInfo::isVirtualRegister(Op11.getReg()))
+    MI11 = MRI.getUniqueVRegDef(Op11.getReg());
+  if (Op12.isReg() && TargetRegisterInfo::isVirtualRegister(Op12.getReg()))
+    MI12 = MRI.getUniqueVRegDef(Op12.getReg());
+
+  // And they need to be in the trace (otherwise, they won't have a depth).
+  if (!MI11 || !MI12 || MI11->getParent() != MBB || MI12->getParent() != MBB)
+    return false;
+
+  bool CommutePrev = false;
+  if (MI11->getOpcode() != AssocOpcode && MI12->getOpcode() == AssocOpcode) {
+    std::swap(MI11, MI12);
+    CommutePrev = true;
+  }
+
+  // TODO: This restriction can be relaxed. The third instruction in the
+  // sequence could be any other instruction that adds to the height of this
+  // trace.
+  if (MI11->getOpcode() != AssocOpcode)
+    return false;
+
+  // Avoid reassociating operands when it won't provide any benefit. If the
+  // earlier instruction's operands are both produced by this type of
+  // instruction, then we might have just optimized it.
+  if (MI12->getOpcode() == AssocOpcode)
+    return false;
+
+  // We found a sequence of instructions that may be suitable for a
+  // reassociation of operands to increase ILP. Select the appropriate codegen
+  // pattern based on the commutation of operands found here.
+  if (CommutePrev) {
+    if (CommuteRoot)
+      Pattern.push_back(MachineCombinerPattern::MC_REASSOC_4);
+    else
+      Pattern.push_back(MachineCombinerPattern::MC_REASSOC_3);
+  } else {
+    if (CommuteRoot)
+      Pattern.push_back(MachineCombinerPattern::MC_REASSOC_2);
+    else
+      Pattern.push_back(MachineCombinerPattern::MC_REASSOC_1);
+  }
+
+  return true;
+}
+
+/// Attempt the following reassociation to reduce critical path length:
+///   A = (Anything)
+///   B = A op X (Prev)
+///   C = B op Y (Root)
+/// ===>
+///   A = (Anything)
+///   B = X op Y
+///   C = A op B
+static void reassociateOps(MachineInstr &Root,
+                           MachineOperand &OpA, MachineOperand &OpB,
+                           MachineOperand &OpC, MachineOperand &OpX,
+                           MachineOperand &OpY,
+                           SmallVectorImpl<MachineInstr *> &InsInstrs,
+                           SmallVectorImpl<MachineInstr *> &DelInstrs) {
+  unsigned RegA = OpA.getReg();
+  unsigned RegB = OpB.getReg();
+  unsigned RegC = OpC.getReg();
+  unsigned RegX = OpX.getReg();
+  unsigned RegY = OpY.getReg();
+
+  MachineFunction *MF = Root.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  if (TargetRegisterInfo::isVirtualRegister(RegA))
+    MRI.constrainRegClass(RegA, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegB))
+    MRI.constrainRegClass(RegB, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegC))
+    MRI.constrainRegClass(RegC, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegX))
+    MRI.constrainRegClass(RegX, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegY))
+    MRI.constrainRegClass(RegY, RC);
+
+  bool KillA = OpA.isKill();
+  bool KillB = OpB.isKill();
+  bool KillX = OpX.isKill();
+  bool KillY = OpY.isKill();
+
+  unsigned Opcode = Root.getOpcode();
+  MachineInstr &Prev = *(MRI.getUniqueVRegDef(OpB.getReg()));
+
+  // Create new instructions for insertion.
+  MachineInstrBuilder MIB1 =
+      BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), RegB)
+          .addReg(RegX, getKillRegState(KillX))
+          .addReg(RegY, getKillRegState(KillY));
+  InsInstrs.push_back(MIB1);
+
+  MachineInstrBuilder MIB2 =
+      BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC)
+          .addReg(RegA, getKillRegState(KillA))
+          .addReg(RegB, getKillRegState(KillB));
+  InsInstrs.push_back(MIB2);
+
+  // Record old instructions for deletion.
+  DelInstrs.push_back(&Prev);
+  DelInstrs.push_back(&Root);
+}
+
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root,
+    MachineCombinerPattern::MC_PATTERN Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // This is an unfortunate amount of code to match commuted variants
+  // of the pattern we want to reassociate.
+  switch (Pattern) {
+  default:
+    assert(false && "Unknown pattern for machine combiner");
+    break;
+  case MachineCombinerPattern::MC_REASSOC_1: {
+    // A = (Anything)
+    // B = A op X (Prev)
+    // C = B op Y (Root)
+    MachineInstr &Prev = *(MRI.getUniqueVRegDef(Root.getOperand(1).getReg()));
+    MachineOperand &OpA = Prev.getOperand(1);
+    MachineOperand &OpB = Root.getOperand(1);
+    MachineOperand &OpC = Root.getOperand(0);
+    MachineOperand &OpX = Prev.getOperand(2);
+    MachineOperand &OpY = Root.getOperand(2);
+
+    reassociateOps(Root, OpA, OpB, OpC, OpX, OpY, InsInstrs, DelInstrs);
+    break;
+  }
+  case MachineCombinerPattern::MC_REASSOC_2: {
+    // A = (Anything)
+    // B = A op X (Prev)
+    // C = Y op B (Root)
+    MachineInstr &Prev = *(MRI.getUniqueVRegDef(Root.getOperand(2).getReg()));
+    MachineOperand &OpA = Prev.getOperand(1);
+    MachineOperand &OpB = Root.getOperand(2);
+    MachineOperand &OpC = Root.getOperand(0);
+    MachineOperand &OpX = Prev.getOperand(2);
+    MachineOperand &OpY = Root.getOperand(1);
+
+    reassociateOps(Root, OpA, OpB, OpC, OpX, OpY, InsInstrs, DelInstrs);
+    break;
+  }
+  case MachineCombinerPattern::MC_REASSOC_3: {
+    // A = (Anything)
+    // B = X op A (Prev)
+    // C = B op Y (Root)
+    MachineInstr &Prev = *(MRI.getUniqueVRegDef(Root.getOperand(1).getReg()));
+    MachineOperand &OpA = Prev.getOperand(2);
+    MachineOperand &OpB = Root.getOperand(1);
+    MachineOperand &OpC = Root.getOperand(0);
+    MachineOperand &OpX = Prev.getOperand(1);
+    MachineOperand &OpY = Root.getOperand(2);
+
+    reassociateOps(Root, OpA, OpB, OpC, OpX, OpY, InsInstrs, DelInstrs);
+    break;
+  }
+  case MachineCombinerPattern::MC_REASSOC_4: {
+    // A = (Anything)
+    // B = X op A (Prev)
+    // C = Y op B (Root)
+    MachineInstr &Prev = *(MRI.getUniqueVRegDef(Root.getOperand(2).getReg()));
+    MachineOperand &OpA = Prev.getOperand(2);
+    MachineOperand &OpB = Root.getOperand(2);
+    MachineOperand &OpC = Root.getOperand(0);
+    MachineOperand &OpX = Prev.getOperand(1);
+    MachineOperand &OpY = Root.getOperand(1);
+
+    reassociateOps(Root, OpA, OpB, OpC, OpX, OpY, InsInstrs, DelInstrs);
+    break;
+  }
+  } // end switch (Pattern)
+
+  return;
+}
+
 namespace {
   /// Create Global Base Reg pass. This initializes the PIC
   /// global base register for x86-32.
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -24,6 +24,10 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
+static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
+               cl::desc("Enable the machine combiner pass"),
+               cl::init(true), cl::Hidden);
+
 extern "C" void LLVMInitializeX86Target() {
   // Register the target.
   RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
@@ -224,6 +228,8 @@
 bool X86PassConfig::addILPOpts() {
   addPass(&EarlyIfConverterID);
+  if (EnableMachineCombinerPass)
+    addPass(&MachineCombinerID);
   return true;
 }
 
Index: test/CodeGen/X86/fp-fast.ll
===================================================================
--- test/CodeGen/X86/fp-fast.ll
+++ test/CodeGen/X86/fp-fast.ll
@@ -114,3 +114,68 @@
   ret float %t2
 }
 
+; Verify that the first two adds are independent regardless of how the inputs are
+; commuted. The destination registers are used as source registers for the third add.
+
+define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+
+; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
+; produced because that would cost more compile time.
+
+define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm5, %xmm4, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm7, %xmm6, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  %t3 = fadd float %t2, %x4
+  %t4 = fadd float %t3, %x5
+  %t5 = fadd float %t4, %x6
+  %t6 = fadd float %t5, %x7
+  ret float %t6
+}
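For reference, the rewrite performed by hasPattern()/reassociateOps() corresponds to the source-level reassociation sketched below. This is an illustrative sketch in plain C++, not part of the patch; the function names are made up, and it assumes fast-math semantics so that floating-point addition may be treated as associative.

    // Before: the adds form a serial dependence chain. The second add cannot
    // start until the first finishes, and the first cannot start until 'a'
    // (produced by some earlier computation) is available.
    float before(float a, float x, float y) {
      float b = a + x;   // B = A op X   (Prev)
      float c = b + y;   // C = B op Y   (Root)
      return c;
    }

    // After: 'x + y' does not depend on 'a', so it can execute in parallel
    // with whatever produces 'a'; only one add remains on the critical path
    // once 'a' is ready. This is the shape the reassociate_adds* tests above
    // check for in the generated vaddss sequences.
    float after(float a, float x, float y) {
      float b = x + y;   // B = X op Y
      float c = a + b;   // C = A op B
      return c;
    }

With the patch applied, the transformation runs only under -enable-unsafe-fp-math and can be toggled via the new hidden -x86-machine-combiner llc option (on by default).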