Index: include/llvm/CodeGen/MachineCombinerPattern.h =================================================================== --- include/llvm/CodeGen/MachineCombinerPattern.h +++ include/llvm/CodeGen/MachineCombinerPattern.h @@ -71,7 +71,10 @@ FMLSv2f32_OP2, FMLSv2f64_OP2, FMLSv4i32_indexed_OP2, - FMLSv4f32_OP2 + FMLSv4f32_OP2, + + // This is FDIV-RECIP pattern matched by X86 machine combiner + Div2RecipEst }; } // end namespace llvm Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14947,8 +14947,8 @@ /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form /// does not require additional intermediate precision] SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) { - if (Level >= AfterLegalizeDAG) - return SDValue(); + // if (Level >= AfterLegalizeDAG) + return SDValue(); // TODO: Handle half and/or extended types? EVT VT = Op.getValueType(); Index: lib/Target/X86/X86InstrInfo.h =================================================================== --- lib/Target/X86/X86InstrInfo.h +++ lib/Target/X86/X86InstrInfo.h @@ -504,6 +504,21 @@ return true; } + /// When getMachineCombinerPatterns() finds patterns, this function generates + /// the instructions that could replace the original code sequence + void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const override; + + /// Return true when there is potentially a faster code sequence + /// for an instruction chain ending in . All potential patterns are + /// listed in the array. + bool getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl &Patterns) const override; + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; bool hasReassociableOperands(const MachineInstr &Inst, Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -9190,6 +9190,331 @@ } } +static bool hasAllOnesOperand(MachineInstr *MI) { + auto Constants = + MI->getParent()->getParent()->getConstantPool()->getConstants(); + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); + MOI != MOE; ++MOI) { + MachineOperand &MO = *MOI; + if (MO.isCPI()) { + // We have a Constant Pool Index operand in this instruction + // FIXME: should we deal with other types of operand like Immediate? + auto ConstantEntry = Constants[MO.getIndex()]; + if (!ConstantEntry.isMachineConstantPoolEntry()) { + if (auto *C = dyn_cast(ConstantEntry.Val.ConstVal)) + if (C->isAllOnesValue()) + return true; + } + } + } + return false; +} + +static bool isDividentAllOnes(MachineFunction &MF, unsigned DividentReg) { + // The divident could be AllOnes value and in this case we should not create + // additional constant for reciprocal division but use the divident instead. + // We're trying to find the devident definition and if it's a constant + // AllOnes value we'll use it + for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); + MBBI != MBBE; ++MBBI) { + for (MachineBasicBlock::instr_iterator MII = MBBI->instr_begin(), + MIE = MBBI->instr_end(); + MII != MIE; ++MII) { + MachineInstr *MI = &*MII; + + for (MachineInstr::mop_iterator MOI = MI->operands_begin(), + MOE = MI->operands_end(); + MOI != MOE; ++MOI) { + MachineOperand &MO = *MOI; + if (!MO.isReg() || !MO.isDef()) + continue; + if (MO.getReg() == DividentReg) { + return hasAllOnesOperand(MI); + } + } + } + } + return false; +} + +/// genFDivReciprocal - Generates A = B * 1/C instead of A = B/C +/// +/// To get more precision we're using Newton-Raphson iterations like here: +/// +/// X[0] = reciprocal (C); +/// X[i+1] = X[i] + X[i] * (1 - C * X[i]); every iteration increases precision +/// +/// And the result of division will be here: A = B * X +/// Example (-x86-asm-syntax=intel): instead of +/// +/// vmovss xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero +/// vdivss xmm0, xmm1, xmm0 +/// +/// we're generating +/// +/// vmovss xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero +/// vrcpss xmm2, xmm0, xmm0 +/// vmulss xmm0, xmm0, xmm2 +/// vsubss xmm0, xmm1, xmm0 +/// vmulss xmm0, xmm0, xmm2 +/// vaddss xmm0, xmm0, xmm2 + +static void genReciprocalDiv(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl &InsInstrs, + DenseMap &InstrIdxForVirtReg, + SmallVector Instrs, int Iterations, + Type *Ty, X86Subtarget &Subtarget) { + + unsigned ResultReg = Root.getOperand(0).getReg(); + + MachineOperand Divident = Root.getOperand(1); + unsigned DividentReg = Divident.getReg(); + bool DividentIsKill = Divident.isKill(); + bool DividentIsAllOnes = isDividentAllOnes(MF, DividentReg); + + unsigned DividerReg = Root.getOperand(2).getReg(); + bool DividerIsKill = Root.getOperand(2).isKill(); + + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + + if (TargetRegisterInfo::isVirtualRegister(ResultReg)) + MRI.constrainRegClass(ResultReg, RC); + if (TargetRegisterInfo::isVirtualRegister(DividentReg)) + MRI.constrainRegClass(DividentReg, RC); + if (TargetRegisterInfo::isVirtualRegister(DividerReg)) + MRI.constrainRegClass(DividerReg, RC); + + // Initial estimate value is recipocal division of C + // rcp + MachineInstrBuilder RcpMI; + unsigned RcpVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(RcpVReg, 0)); + if (Instrs[0] == X86::VRCPSSr) + RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), RcpVReg) + .addReg(DividentReg, getKillRegState(DividentIsKill)) + .addReg(DividerReg, getKillRegState(DividerIsKill)); + else + RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]), RcpVReg) + .addReg(DividerReg, getKillRegState(DividerIsKill)); + InsInstrs.push_back(RcpMI); + + unsigned LoadVReg = 0; + if (!DividentIsAllOnes) { + // We need all ones value to be able to do (1 - C * X[i]) + // load + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + if (MF.getTarget().isPositionIndependent()) { + if (Subtarget.is64Bit()) + PICBase = X86::RIP; + else + // FIXME: PICBase = getGlobalBaseReg(&MF); + // This doesn't work for several reasons. + // 1. GlobalBaseReg may have been spilled. + // 2. It may not be live at MI. + return; + } + // Create a constant-pool entry. + MachineConstantPool &MCP = *MF.getConstantPool(); + // const Constant *C = Constant::getAllOnesValue(Ty); + auto *CFP = ConstantFP::get(Ty, 1.0); + unsigned CPI = MCP.getConstantPoolIndex(CFP, 4); + LoadVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(LoadVReg, 0)); + + MachineInstrBuilder LoadMI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]), LoadVReg) + .addReg(PICBase) + .addImm(1) + .addReg(0) + .addConstantPoolIndex(CPI) + .addReg(0); + InsInstrs.push_back(LoadMI); + } + if (Iterations < 1) + Iterations = 1; // undefined or zero iterations should produce 1 iteration + unsigned EstVReg = RcpVReg; // X[0] = reciprocal (C); + + for (int i = 0; i < Iterations; i++) { + // mul + unsigned MulVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(MulVReg, 0)); + MachineInstrBuilder MulMI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), MulVReg) + .addReg(DividerReg, getKillRegState(DividerIsKill)) + .addReg(EstVReg); + InsInstrs.push_back(MulMI); // C * X[i] + + // sub + unsigned SubVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(SubVReg, 0)); + MachineInstrBuilder SubMI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[3]), SubVReg) + .addReg(DividentIsAllOnes ? DividentReg : LoadVReg) + .addReg(MulVReg); + InsInstrs.push_back(SubMI); // 1 - C * X[i] + + // mul2 + unsigned Mul2VReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(Mul2VReg, 0)); + MachineInstrBuilder Mul2MI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[4]), Mul2VReg) + .addReg(SubVReg) + .addReg(EstVReg); + InsInstrs.push_back(Mul2MI); // X[i] * (1 - C * X[i]) + + // add + unsigned AddVReg = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(AddVReg, 0)); + MachineInstrBuilder AddMI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]), AddVReg) + .addReg(Mul2VReg) + .addReg(EstVReg); + InsInstrs.push_back(AddMI); // X[i] + X[i] * (1 - C * X[i]) + EstVReg = AddVReg; + } + // The final multiplication B * 1/C + // ResultMul + MachineInstrBuilder ResultMulMI = + BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[6]), ResultReg) + .addReg(DividentReg) + .addReg(EstVReg); + InsInstrs.push_back(ResultMulMI); + return; +} + +static EVT getFDivEVT(MachineInstr &Root) { + // FIXME: should we support other kinds of DIV? + switch (Root.getOpcode()) { + default: + break; + case X86::DIVSSrr: // f32 + case X86::VDIVSSrr: // f32 + return MVT::f32; + case X86::DIVPSrr: // v4f32 + case X86::VDIVPSrr: // v4f32 + return MVT::v4f32; + case X86::VDIVPSYrr: // v8f32 + return MVT::v8f32; + } + return MVT::INVALID_SIMPLE_VALUE_TYPE; +} + +/// When getMachineCombinerPatterns() finds potential patterns, +/// this function generates the instructions that could replace the +/// original code sequence +void X86InstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const { + + MachineBasicBlock &MBB = *Root.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const TargetLowering *TLI = MF.getSubtarget().getTargetLowering(); + // Estimates may be explicitly enabled for this type with a custom number of + // refinement steps. + int Iterations = TLI->getDivRefinementSteps(getFDivEVT(Root), MF); + DEBUG(dbgs() << "Alternative code sequense for " << MF.getName() << " with " + << Iterations << " iterations\n"); + + switch (Pattern) { + default: + // Reassociate instructions. + TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, + DelInstrs, InstrIdxForVirtReg); + return; + case MachineCombinerPattern::Div2RecipEst: + switch (Root.getOpcode()) { + default: + return; + case X86::VDIVSSrr: // f32 + genReciprocalDiv( + MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, + {X86::VRCPSSr, X86::VMULSSrr, X86::VMOVSSrm, X86::VSUBSSrr, + X86::VMULSSrr, X86::VADDSSrr, X86::VMULSSrr}, + Iterations, Type::getFloatTy(MF.getFunction()->getContext()), + Subtarget); + break; + case X86::VDIVPSrr: // v4f32 + genReciprocalDiv( + MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, + {X86::VRCPPSr, X86::VMULPSrr, X86::VMOVAPSrm, X86::VSUBPSrr, + X86::VMULPSrr, X86::VADDPSrr, X86::VMULPSrr}, + Iterations, + VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4), + Subtarget); + break; + case X86::VDIVPSYrr: // v8f32 + genReciprocalDiv( + MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, + {X86::VRCPPSYr, X86::VMULPSYrr, X86::VMOVAPSYrm, X86::VSUBPSYrr, + X86::VMULPSYrr, X86::VADDPSYrr, X86::VMULPSYrr}, + Iterations, + VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8), + Subtarget); + break; + case X86::DIVSSrr: // f32 + genReciprocalDiv( + MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, + { + X86::RCPSSr, X86::MULSSrr, X86::MOVSSrm, X86::SUBSSrr, + X86::MULSSrr, X86::ADDSSrr, X86::MULSSrr, + }, + Iterations, Type::getFloatTy(MF.getFunction()->getContext()), + Subtarget); + break; + case X86::DIVPSrr: // v4f32 + genReciprocalDiv( + MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, + {X86::RCPPSr, X86::MULPSrr, X86::MOVAPSrr, X86::SUBPSrr, X86::MULPSrr, + X86::ADDPSrr, X86::MULPSrm}, + Iterations, + VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4), + Subtarget); + break; + } + break; + } + DelInstrs.push_back(&Root); // Record FDiv for deletion +} + +/// Find instructions that can be turned into recip +static bool getFDIVPatterns(MachineInstr &Root, + SmallVectorImpl &Patterns) { + auto *MF = Root.getParent()->getParent(); + auto TLI = MF->getSubtarget().getTargetLowering(); + EVT VT = getFDivEVT(Root); + if (VT == MVT::INVALID_SIMPLE_VALUE_TYPE) + return false; + int Enabled = TLI->getRecipEstimateDivEnabled(VT, *MF); + if (Enabled == TLI->ReciprocalEstimate::Disabled) + return false; + Patterns.push_back(MachineCombinerPattern::Div2RecipEst); + return true; +} + +/// Return true when there is potentially a faster code sequence for an +/// instruction chain ending in \p Root. All potential patterns are listed in +/// the \p Pattern vector. Pattern should be sorted in priority order since the +/// pattern evaluator stops checking as soon as it finds a faster sequence. + +bool X86InstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl &Patterns) const { + // FDIV patterns + if (getFDIVPatterns(Root, Patterns)) + return true; + // TODO: FSQRT patterns will be prepared after reciprocal implementation + // completes + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + /// This is an architecture-specific helper function of reassociateOps. /// Set special operand attributes for new instructions after reassociation. void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,