Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -52,6 +52,7 @@ #include #include #include +#include using namespace llvm; #define DEBUG_TYPE "x86-isel" @@ -17935,28 +17936,179 @@ return MBB; } -// Replace 213-type (isel default) FMA3 instructions with 231-type for -// accumulator loops. Writing back to the accumulator allows the coalescer -// to remove extra copies in the loop. -MachineBasicBlock * -X86TargetLowering::emitFMA3Instr(MachineInstr *MI, - MachineBasicBlock *MBB) const { - MachineOperand &AddendOp = MI->getOperand(3); +/// \brief Switches FMA213xxxx into FMA231xxxx form. +static unsigned switchFMA213To231(unsigned OldFMAOpc) { + switch (OldFMAOpc) { + case X86::VFMADDPDr213r: + return X86::VFMADDPDr231r; + case X86::VFMADDPSr213r: + return X86::VFMADDPSr231r; + case X86::VFMADDSDr213r: + return X86::VFMADDSDr231r; + case X86::VFMADDSSr213r: + return X86::VFMADDSSr231r; + case X86::VFMSUBPDr213r: + return X86::VFMSUBPDr231r; + case X86::VFMSUBPSr213r: + return X86::VFMSUBPSr231r; + case X86::VFMSUBSDr213r: + return X86::VFMSUBSDr231r; + case X86::VFMSUBSSr213r: + return X86::VFMSUBSSr231r; + case X86::VFNMADDPDr213r: + return X86::VFNMADDPDr231r; + case X86::VFNMADDPSr213r: + return X86::VFNMADDPSr231r; + case X86::VFNMADDSDr213r: + return X86::VFNMADDSDr231r; + case X86::VFNMADDSSr213r: + return X86::VFNMADDSSr231r; + case X86::VFNMSUBPDr213r: + return X86::VFNMSUBPDr231r; + case X86::VFNMSUBPSr213r: + return X86::VFNMSUBPSr231r; + case X86::VFNMSUBSDr213r: + return X86::VFNMSUBSDr231r; + case X86::VFNMSUBSSr213r: + return X86::VFNMSUBSSr231r; + case X86::VFMADDPDr213rY: + return X86::VFMADDPDr231rY; + case X86::VFMADDPSr213rY: + return X86::VFMADDPSr231rY; + case X86::VFMSUBPDr213rY: + return X86::VFMSUBPDr231rY; + case X86::VFMSUBPSr213rY: + return X86::VFMSUBPSr231rY; + case X86::VFNMADDPDr213rY: + return X86::VFNMADDPDr231rY; + case X86::VFNMADDPSr213rY: + return X86::VFNMADDPSr231rY; + case X86::VFNMSUBPDr213rY: + return X86::VFNMSUBPDr231rY; + case X86::VFNMSUBPSr213rY: + return X86::VFNMSUBPSr231rY; + default: + llvm_unreachable("Unrecognized FMA variant."); + } +} - // Bail out early if the addend isn't a register - we can't switch these. - if (!AddendOp.isReg()) - return MBB; +/// \brief Switches FMA213xxxx into FMA132xxxx form. +static unsigned switchFMA213To132(unsigned OldFMAOpc) { + switch (OldFMAOpc) { + case X86::VFMADDPDr213r: + return X86::VFMADDPDr132r; + case X86::VFMADDPSr213r: + return X86::VFMADDPSr132r; + case X86::VFMADDSDr213r: + return X86::VFMADDSDr132r; + case X86::VFMADDSSr213r: + return X86::VFMADDSSr132r; + case X86::VFMSUBPDr213r: + return X86::VFMSUBPDr132r; + case X86::VFMSUBPSr213r: + return X86::VFMSUBPSr132r; + case X86::VFMSUBSDr213r: + return X86::VFMSUBSDr132r; + case X86::VFMSUBSSr213r: + return X86::VFMSUBSSr132r; + case X86::VFNMADDPDr213r: + return X86::VFNMADDPDr132r; + case X86::VFNMADDPSr213r: + return X86::VFNMADDPSr132r; + case X86::VFNMADDSDr213r: + return X86::VFNMADDSDr132r; + case X86::VFNMADDSSr213r: + return X86::VFNMADDSSr132r; + case X86::VFNMSUBPDr213r: + return X86::VFNMSUBPDr132r; + case X86::VFNMSUBPSr213r: + return X86::VFNMSUBPSr132r; + case X86::VFNMSUBSDr213r: + return X86::VFNMSUBSDr132r; + case X86::VFNMSUBSSr213r: + return X86::VFNMSUBSSr132r; + case X86::VFMADDPDr213rY: + return X86::VFMADDPDr132rY; + case X86::VFMADDPSr213rY: + return X86::VFMADDPSr132rY; + case X86::VFMSUBPDr213rY: + return X86::VFMSUBPDr132rY; + case X86::VFMSUBPSr213rY: + return X86::VFMSUBPSr132rY; + case X86::VFNMADDPDr213rY: + return X86::VFNMADDPDr132rY; + case X86::VFNMADDPSr213rY: + return X86::VFNMADDPSr132rY; + case X86::VFNMSUBPDr213rY: + return X86::VFNMSUBPDr132rY; + case X86::VFNMSUBPSr213rY: + return X86::VFNMSUBPSr132rY; + default: + llvm_unreachable("Unrecognized FMA variant."); + } +} + +/// \brief Returns true if the given machine operand mio +/// is defined via a full copy from the given physical register physReg. +static bool isOperandFullCopyFromReg(MachineRegisterInfo &MRI, + MachineOperand &mio, unsigned physReg) { + assert(mio.isReg()); + auto &def = *MRI.def_instr_begin(mio.getReg()); + return def.isFullCopy() && def.getOperand(1).getReg() == physReg; +} + +/// \brief Given the operands permutation (for ex. [0,2,3,1]), +/// returns FMA form index for which produces %0 = %1 * %2 + %3. +static unsigned canonicalizeFMA(const std::array &Operands) { + const unsigned From = 100 * Operands[1] + 10 * Operands[2] + 1 * Operands[3]; + switch (From) { + case 132: + case 231: + return 132; + case 123: + case 213: + return 213; + case 312: + case 321: + return 231; + default: + llvm_unreachable("Unexpected FMA form"); + } +} + +// Try optimize FMA3 in 3 ways: +// 1) Replace 213-type (isel default) FMA3 instructions with 231-type for +// accumulator loops. Writing back to the accumulator allows the coalescer +// to remove extra copies in the loop. +// 2) If 1st operand (tied with dest) is used only for copying to physical reg, +// then look for operand which is defined by copying from this phys. +// register. +// If such operand found, make it a 1st to eliminate excessive MOVs. +// 3) Try to place killed operands to the 1st place. It can save phys register +// and avoid copying. +// 4) Make memory loading operand a 3rd to help folding. +// +// First we permute operands and then find the most suitable FMA form for it. +MachineBasicBlock * X86TargetLowering::emitFMA3Instr(MachineInstr *MI, MachineBasicBlock *MBB) const { + + // Sanity checks. + assert(MI->getNumOperands() == 4 && "FMA3 must have 4 operands."); + // Expect registers only: no mem, no immediates. + for (unsigned i = 0; i < MI->getNumOperands(); ++i) + if (!MI->getOperand(i).isReg()) + return MBB; + + // Initial operands permutation corresponding to FMA213. + const std::array OperandsId = {0, 1, 2, 3}; + // Optimized operands permutation. + std::array Operands = OperandsId; + // 1st operand index was optimized earlier and must not be changed. + bool FirstFixed = false; MachineFunction &MF = *MBB->getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); - // Check whether the addend is defined by a PHI: - assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); - MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); - if (!AddendDef.isPHI()) - return MBB; - - // Look for the following pattern: + // 1) Look for the following pattern: // loop: // %addend = phi [%entry, 0], [%loop, %result] // ... @@ -17968,53 +18120,117 @@ // ... // %result = FMA231 %addend, %m1, %m2 - for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { - assert(AddendDef.getOperand(i).isReg()); - MachineOperand PHISrcOp = AddendDef.getOperand(i); - MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); - if (&PHISrcInst == MI) { - // Found a matching instruction. - unsigned NewFMAOpc = 0; - switch (MI->getOpcode()) { - case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break; - case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break; - case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break; - case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break; - case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break; - case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break; - case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break; - case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break; - case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break; - case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break; - case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break; - case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break; - case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break; - case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; - case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; - case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; - case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; - case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; - case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; - case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break; - case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break; - case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; - case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; - case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; - default: llvm_unreachable("Unrecognized FMA variant."); + if (!FirstFixed) { + const unsigned AddendIndex = 3; + MachineOperand &AddendOp = MI->getOperand(Operands[AddendIndex]); + // Check whether the addend is defined by a PHI: + assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?"); + MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg()); + if (AddendDef.isPHI()) { + for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) { + MachineOperand PHISrcOp = AddendDef.getOperand(i); + MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg()); + if (&PHISrcInst == MI) { + std::swap(Operands[1], Operands[AddendIndex]); + // Fix the first operand, so it cannot be replaced later. + FirstFixed = true; + break; + } } + } + } - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - MachineInstrBuilder MIB = - BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(3)) - .addOperand(MI->getOperand(2)) - .addOperand(MI->getOperand(1)); - MBB->insert(MachineBasicBlock::iterator(MI), MIB); - MI->eraseFromParent(); + // Try to change destination registers only if this FMA has exactly 1 usage. + if (MRI.hasOneUse(MI->getOperand(Operands[0]).getReg())) { + // 2) Look for pattern + // + // %a = ... + // %b = ... + // %c = ... + // %res = FMAxxx %a, %b, %c + // %PHYS_REG = COPY %res + // + // and replace it with FMA form where first operand is copied + // directly from %PHYS_REG, if possible. It eliminates COPY after FMA. + // + if (!FirstFixed) { + do { + // a) Check that FMA result is copied into phys reg. + auto UseOfFMADefIt = + MRI.use_instr_begin(MI->getOperand(Operands[0]).getReg()); + // If this FMA is useless, do nothing: it will be removed later by DCE. + if (UseOfFMADefIt == MRI.use_instr_end()) + break; + MachineInstr &UseOfFMADef = *UseOfFMADefIt; + const unsigned FMADefReg = UseOfFMADef.getOperand(0).getReg(); + if (!UseOfFMADef.isFullCopy() || + !TargetRegisterInfo::isPhysicalRegister(FMADefReg)) + break; + + // b) Find the operand which is defined by copying from the same phys + // register to which FMA writes. + // Also check the first operand: if it matches, then the existing FMA + // schema is optimal. + for (unsigned i = 1; i <= 3; ++i) + if (isOperandFullCopyFromReg(MRI, MI->getOperand(Operands[i]), + FMADefReg)) { + // Fix 1st operand. + FirstFixed = true; + std::swap(Operands[1], Operands[i]); + break; + } + } while (false); + } + + // 3) Find first killed operand and make it first, if possible, to reuse its + // physical register. + if (!FirstFixed) { + do { + for (unsigned i = 1; i <= 3; ++i) { + MachineOperand &Operand = MI->getOperand(Operands[i]); + if (Operand.isKill()) { + FirstFixed = true; + std::swap(Operands[1], Operands[i]); + break; + } + } + } while (false); } } + // 4) Try to make memory accessing operand the 3rd to help memory folding. + // We can't move 1st operand if it's already fixed on prev. step. + for (unsigned i = 3; i > (FirstFixed ? 1 : 0); --i) { + MachineOperand &Operand = MI->getOperand(Operands[i]); + if (MRI.def_instr_begin(Operand.getReg())->canFoldAsLoad()) { + // Operand may load memory, so move it to 3d place to have a chance of + // folding. + std::swap(Operands[3], Operands[i]); + break; + } + } + + // Operands order changed? We need to regenerate the instruction. + if (Operands != OperandsId) { + const unsigned NewFMAOp = canonicalizeFMA(Operands); + assert(NewFMAOp == 132 || NewFMAOp == 213 || NewFMAOp == 231); + const unsigned CurrFMAOpc = MI->getOpcode(); + unsigned NewFMAOpc; + if (NewFMAOp == 132) + NewFMAOpc = switchFMA213To132(CurrFMAOpc); + else if (NewFMAOp == 231) + NewFMAOpc = switchFMA213To231(CurrFMAOpc); + else if (NewFMAOp == 213) + NewFMAOpc = CurrFMAOpc; + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(Operands[1])) + .addOperand(MI->getOperand(Operands[2])) + .addOperand(MI->getOperand(Operands[3])); + MBB->insert(MachineBasicBlock::iterator(MI), MIB); + MI->eraseFromParent(); + } return MBB; } Index: lib/Target/X86/X86InstrFMA.td =================================================================== --- lib/Target/X86/X86InstrFMA.td +++ lib/Target/X86/X86InstrFMA.td @@ -72,7 +72,9 @@ let neverHasSideEffects = 1 in { defm r132 : fma3p_rm; + MemFrag128, MemFrag256, OpTy128, OpTy256, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 0>; // For 231, only the register variant is commutable. // For the memory variant the folded operand must be in 3. Thus, // in that case, it cannot be swapped with 2. @@ -157,7 +159,9 @@ ComplexPattern mem_cpat> { let neverHasSideEffects = 1 in { defm r132 : fma3s_rm; + x86memop, RC, OpVT, mem_frag, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 0>; // See the other defm of r231 for the explanation regarding the // commutable flags. defm r231 : fma3s_rm