Index: lib/Target/X86/X86InstrInfo.h =================================================================== --- lib/Target/X86/X86InstrInfo.h +++ lib/Target/X86/X86InstrInfo.h @@ -26,6 +26,19 @@ class X86RegisterInfo; class X86Subtarget; + namespace MachineCombinerPattern { + enum MC_PATTERN : int { + // These are commutative variants for reassociating a computation chain + // of the form: + // B = A op X (Prev) + // C = B op Y (Root) + MC_REASSOC_AX_BY = 0, + MC_REASSOC_AX_YB = 1, + MC_REASSOC_XA_BY = 2, + MC_REASSOC_XA_YB = 3, + }; + } // end namespace MachineCombinerPattern + namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. @@ -429,6 +442,26 @@ const MachineInstr *UseMI, unsigned UseIdx) const override; + + bool useMachineCombiner() const override { + return true; + } + + /// Return true when there is potentially a faster code sequence + /// for an instruction chain ending in <Root>. All potential patterns are + /// output in the <Pattern> array. + bool hasPattern( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override; + + /// When hasPattern() finds a pattern, this function generates the + /// instructions that could replace the original code sequence. + void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; + /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2 if having two register operands, and the value it /// compares against in CmpValue. 
Return true if the comparison instruction Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -6226,6 +6226,210 @@ return isHighLatencyDef(DefMI->getOpcode()); } +/// If the input instruction is part of a chain of dependent ops that are +/// suitable for reassociation, return the earlier instruction in the sequence +/// that defines its first operand, otherwise return a nullptr. +/// If the instruction's operands must be commuted to be considered a +/// reassociation candidate, Commuted will be set to true. +static MachineInstr *isReassocCandidate(const MachineInstr &Inst, + unsigned AssocOpcode, + bool checkPrevOneUse, + bool &Commuted) { + if (Inst.getOpcode() != AssocOpcode) + return nullptr; + + MachineOperand Op1 = Inst.getOperand(1); + MachineOperand Op2 = Inst.getOperand(2); + + const MachineBasicBlock *MBB = Inst.getParent(); + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + // We need virtual register definitions. + MachineInstr *MI1 = nullptr; + MachineInstr *MI2 = nullptr; + if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) + MI1 = MRI.getUniqueVRegDef(Op1.getReg()); + if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) + MI2 = MRI.getUniqueVRegDef(Op2.getReg()); + + // And they need to be in the trace (otherwise, they won't have a depth). + if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB) + return nullptr; + + Commuted = false; + if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) { + std::swap(MI1, MI2); + Commuted = true; + } + + // Avoid reassociating operands when it won't provide any benefit. If both + // operands are produced by instructions of this type, we may already + // have the optimal sequence. 
+ if (MI2->getOpcode() == AssocOpcode) + return nullptr; + + // The instruction must only be used by the other instruction that we + // reassociate with. + if (checkPrevOneUse && !MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) + return nullptr; + + // We must match a simple chain of dependent ops. + // TODO: This check is not necessary for the earliest instruction in the + // sequence. Instead of a sequence of 3 dependent instructions with the same + // opcode, we only need to find a sequence of 2 dependent instructions with + // the same opcode plus 1 other instruction that adds to the height of the + // trace. + if (MI1->getOpcode() != AssocOpcode) + return nullptr; + + return MI1; +} + +/// Select a pattern based on how the operands of each associative operation +/// need to be commuted. +static MachineCombinerPattern::MC_PATTERN getPattern(bool CommutePrev, + bool CommuteRoot) { + if (CommutePrev) { + if (CommuteRoot) + return MachineCombinerPattern::MC_REASSOC_XA_YB; + return MachineCombinerPattern::MC_REASSOC_XA_BY; + } else { + if (CommuteRoot) + return MachineCombinerPattern::MC_REASSOC_AX_YB; + return MachineCombinerPattern::MC_REASSOC_AX_BY; + } +} + +bool X86InstrInfo::hasPattern(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const { + if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) + return false; + + // TODO: There are many more associative instruction types to match: + // 1. Other forms of scalar FP add (non-AVX) + // 2. Other data types (double, integer, vectors) + // 3. Other math / logic operations (mul, and, or) + unsigned AssocOpcode = X86::VADDSSrr; + + // TODO: There is nothing x86-specific here except the instruction type. + // This logic could be hoisted into the machine combiner pass itself. 
+ bool CommuteRoot; + if (MachineInstr *Prev = isReassocCandidate(Root, AssocOpcode, true, + CommuteRoot)) { + bool CommutePrev; + if (isReassocCandidate(*Prev, AssocOpcode, false, CommutePrev)) { + // We found a sequence of instructions that may be suitable for a + // reassociation of operands to increase ILP. + Pattern.push_back(getPattern(CommutePrev, CommuteRoot)); + return true; + } + } + + return false; +} + +/// Attempt the following reassociation to reduce critical path length: +/// B = A op X (Prev) +/// C = B op Y (Root) +/// ===> +/// B = X op Y +/// C = A op B +static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, + MachineCombinerPattern::MC_PATTERN Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { + MachineFunction *MF = Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + + // This array encodes the operand index for each parameter because the + // operands may be commuted. Each row corresponds to a pattern value, + // and each column specifies the index of A, B, X, Y. 
+ unsigned OpIdx[4][4] = { + { 1, 1, 2, 2 }, + { 1, 2, 2, 1 }, + { 2, 1, 1, 2 }, + { 2, 2, 1, 1 } + }; + + MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]); + MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]); + MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]); + MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]); + MachineOperand &OpC = Root.getOperand(0); + + unsigned RegA = OpA.getReg(); + unsigned RegB = OpB.getReg(); + unsigned RegX = OpX.getReg(); + unsigned RegY = OpY.getReg(); + unsigned RegC = OpC.getReg(); + + if (TargetRegisterInfo::isVirtualRegister(RegA)) + MRI.constrainRegClass(RegA, RC); + if (TargetRegisterInfo::isVirtualRegister(RegB)) + MRI.constrainRegClass(RegB, RC); + if (TargetRegisterInfo::isVirtualRegister(RegX)) + MRI.constrainRegClass(RegX, RC); + if (TargetRegisterInfo::isVirtualRegister(RegY)) + MRI.constrainRegClass(RegY, RC); + if (TargetRegisterInfo::isVirtualRegister(RegC)) + MRI.constrainRegClass(RegC, RC); + + // Create a new virtual register for the result of (X op Y) instead of + // recycling RegB because the MachineCombiner's computation of the critical + // path requires a new register definition rather than an existing one. + unsigned NewVR = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + + unsigned Opcode = Root.getOpcode(); + bool KillA = OpA.isKill(); + bool KillX = OpX.isKill(); + bool KillY = OpY.isKill(); + + // Create new instructions for insertion. + MachineInstrBuilder MIB1 = + BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) + .addReg(RegX, getKillRegState(KillX)) + .addReg(RegY, getKillRegState(KillY)); + InsInstrs.push_back(MIB1); + + MachineInstrBuilder MIB2 = + BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) + .addReg(RegA, getKillRegState(KillA)) + .addReg(NewVR, getKillRegState(true)); + InsInstrs.push_back(MIB2); + + // Record old instructions for deletion. 
+ DelInstrs.push_back(&Prev); + DelInstrs.push_back(&Root); +} + +void X86InstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, + MachineCombinerPattern::MC_PATTERN Pattern, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const { + MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo(); + + // Select the previous instruction in the sequence based on the input pattern. + MachineInstr *Prev = nullptr; + if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_BY || + Pattern == MachineCombinerPattern::MC_REASSOC_XA_BY) + Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + else if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_YB || + Pattern == MachineCombinerPattern::MC_REASSOC_XA_YB) + Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); + else + assert("Unknown pattern for machine combiner"); + + reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); + return; +} + namespace { /// Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -24,6 +24,10 @@ #include "llvm/Target/TargetOptions.h" using namespace llvm; +static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializeX86Target() { // Register the target. 
RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target); @@ -224,6 +228,8 @@ bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); + if (EnableMachineCombinerPass) + addPass(&MachineCombinerID); return true; } Index: test/CodeGen/X86/fp-fast.ll =================================================================== --- test/CodeGen/X86/fp-fast.ll +++ test/CodeGen/X86/fp-fast.ll @@ -114,3 +114,81 @@ ret float %t2 } +; Verify that the first two adds are independent regardless of how the inputs are +; commuted. The destination registers are used as source registers for the third add. + +define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds1: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %t0, %x2 + %t2 = fadd float %t1, %x3 + ret float %t2 +} + +define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds2: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %x2, %t0 + %t2 = fadd float %t1, %x3 + ret float %t2 +} + +define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds3: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %t0, %x2 + %t2 = fadd float %x3, %t1 + ret float %t2 +} + +define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) { +; CHECK-LABEL: reassociate_adds4: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; 
CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %x2, %t0 + %t2 = fadd float %x3, %t1 + ret float %t2 +} + +; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not +; produced because that would cost more compile time. + +define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) { +; CHECK-LABEL: reassociate_adds5: +; CHECK: # BB#0: +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm3, %xmm2, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm5, %xmm4, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddss %xmm7, %xmm6, %xmm1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %t0 = fadd float %x0, %x1 + %t1 = fadd float %t0, %x2 + %t2 = fadd float %t1, %x3 + %t3 = fadd float %t2, %x4 + %t4 = fadd float %t3, %x5 + %t5 = fadd float %t4, %x6 + %t6 = fadd float %t5, %x7 + ret float %t6 +}