Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -71,7 +71,10 @@
   FMLSv2f32_OP2,
   FMLSv2f64_OP2,
   FMLSv4i32_indexed_OP2,
-  FMLSv4f32_OP2
+  FMLSv4f32_OP2,
+
+  // This is the FDIV-RECIP pattern matched by the X86 machine combiner.
+  Div2RecipEst
 };
 
 } // end namespace llvm
Index: include/llvm/CodeGen/MachineInstr.h
===================================================================
--- include/llvm/CodeGen/MachineInstr.h
+++ include/llvm/CodeGen/MachineInstr.h
@@ -1149,9 +1149,10 @@
   //
   // Debugging support
   //
-  void print(raw_ostream &OS, bool SkipOpers = false) const;
-  void print(raw_ostream &OS, ModuleSlotTracker &MST,
-             bool SkipOpers = false) const;
+  void print(raw_ostream &OS, bool SkipOpers = false,
+             const MachineBasicBlock *P = nullptr) const;
+  void print(raw_ostream &OS, ModuleSlotTracker &MST, bool SkipOpers = false,
+             const MachineBasicBlock *P = nullptr) const;
   void dump() const;
 
   //===--------------------------------------------------------------------===//
Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -152,9 +152,13 @@
         assert(DefInstr &&
                "There must be a definition for a new virtual register");
         DepthOp = InstrDepth[II->second];
-        LatencyOp = TSchedModel.computeOperandLatency(
-            DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
-            InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
+        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
+        if (DefIdx < 0 || UseIdx < 0)
+          LatencyOp = TII->defaultDefLatency(SchedModel, *DefInstr);
+        else
+          LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
+                                                        InstrPtr, UseIdx);
       } else {
         MachineInstr *DefInstr = getOperandDef(MO);
         if (DefInstr) {
Index: lib/CodeGen/MachineInstr.cpp
===================================================================
--- lib/CodeGen/MachineInstr.cpp
+++ lib/CodeGen/MachineInstr.cpp
@@ -1690,26 +1690,28 @@
 #endif
 }
 
-void MachineInstr::print(raw_ostream &OS, bool SkipOpers) const {
+void MachineInstr::print(raw_ostream &OS, bool SkipOpers,
+                         const MachineBasicBlock *P) const {
   const Module *M = nullptr;
   if (const MachineBasicBlock *MBB = getParent())
     if (const MachineFunction *MF = MBB->getParent())
       M = MF->getFunction()->getParent();
 
   ModuleSlotTracker MST(M);
-  print(OS, MST, SkipOpers);
+  print(OS, MST, SkipOpers, P);
 }
 
 void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
-                         bool SkipOpers) const {
+                         bool SkipOpers, const MachineBasicBlock *P) const {
   // We can be a bit tidier if we know the MachineFunction.
   const MachineFunction *MF = nullptr;
   const TargetRegisterInfo *TRI = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
   const TargetInstrInfo *TII = nullptr;
   const TargetIntrinsicInfo *IntrinsicInfo = nullptr;
+  const MachineBasicBlock *MBB = P ? P : getParent();
 
-  if (const MachineBasicBlock *MBB = getParent()) {
+  if (MBB) {
     MF = MBB->getParent();
     if (MF) {
       MRI = &MF->getRegInfo();
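
Note: the extra MachineBasicBlock parameter on MachineInstr::print() is
needed because the machine combiner builds its candidate instructions with
BuildMI(MF, ...) without inserting them into any block, so getParent()
returns null and the printer cannot reach the MachineFunction. A minimal
sketch of the intended call pattern (condensed from the DEBUG loop later in
this patch):

  // NewMI has no parent block yet; pass the root's block explicitly so
  // print() can still locate the function and its register info.
  MachineInstr *NewMI = InsInstrs[i];
  NewMI->print(dbgs(), /*SkipOpers=*/false, /*P=*/Root.getParent());
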
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -359,7 +359,6 @@
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
    SDValue BuildUDIV(SDNode *N);
-    SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags);
     SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags);
     SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags);
     SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, bool Recip);
@@ -9029,12 +9028,6 @@
       }
     }
   }
-
-  // Fold into a reciprocal estimate and multiply instead of a real divide.
-  if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) {
-    AddToWorklist(RV.getNode());
-    return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
-  }
   }
 
   // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
@@ -14939,59 +14932,6 @@
 }
 
 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
-/// For the reciprocal, we need to find the zero of the function:
-///   F(X) = A X - 1 [which has a zero at X = 1/A]
-///     =>
-///   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
-///     does not require additional intermediate precision]
-SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
-  if (Level >= AfterLegalizeDAG)
-    return SDValue();
-
-  // TODO: Handle half and/or extended types?
-  EVT VT = Op.getValueType();
-  if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64)
-    return SDValue();
-
-  // If estimates are explicitly disabled for this function, we're done.
-  MachineFunction &MF = DAG.getMachineFunction();
-  int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
-  if (Enabled == TLI.ReciprocalEstimate::Disabled)
-    return SDValue();
-
-  // Estimates may be explicitly enabled for this type with a custom number of
-  // refinement steps.
-  int Iterations = TLI.getDivRefinementSteps(VT, MF);
-  if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
-    AddToWorklist(Est.getNode());
-
-    if (Iterations) {
-      EVT VT = Op.getValueType();
-      SDLoc DL(Op);
-      SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
-
-      // Newton iterations: Est = Est + Est (1 - Arg * Est)
-      for (int i = 0; i < Iterations; ++i) {
-        SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
-        AddToWorklist(NewEst.getNode());
-
-        NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags);
-        AddToWorklist(NewEst.getNode());
-
-        NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
-        AddToWorklist(NewEst.getNode());
-
-        Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags);
-        AddToWorklist(Est.getNode());
-      }
-    }
-    return Est;
-  }
-
-  return SDValue();
-}
-
-/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
 /// For the reciprocal sqrt, we need to find the zero of the function:
 ///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
 ///     =>
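
Note: both the removed DAG-level code and the new MI-level code implement
the same Newton-Raphson refinement, X[i+1] = X[i] + X[i] * (1 - A * X[i]),
which roughly doubles the number of correct bits per step on top of the
~12-bit hardware estimate. A self-contained sketch in plain C++ (the crude
starting value stands in for RCPSS):

  #include <cstdio>

  // Newton-Raphson for 1/A: each step computes Est += Est * (1 - A * Est).
  float refineRecip(float A, float Est, int Iterations) {
    for (int i = 0; i < Iterations; ++i)
      Est = Est + Est * (1.0f - A * Est);
    return Est;
  }

  int main() {
    // A ~3-digit estimate of 1/3 refined once is already accurate to about
    // six digits, mirroring the combiner's Iterations knob.
    printf("%.9f\n", refineRecip(3.0f, 0.333f, 1)); // ~0.333333
    return 0;
  }
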
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -504,6 +504,21 @@
     return true;
   }
 
+  /// When getMachineCombinerPatterns() finds patterns, this function generates
+  /// the instructions that could replace the original code sequence.
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in \p Root. All potential patterns are
+  /// listed in the \p Patterns array.
+  bool getMachineCombinerPatterns(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern> &Patterns) const override;
+
   bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
 
   bool hasReassociableOperands(const MachineInstr &Inst,
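
Note: these two overrides plug into the generic MachineCombiner pass the
same way the existing AArch64 hooks do. The driving loop, condensed from
MachineCombiner::combineInstructions (details elided):

  // for each MI in each basic block:
  //   if (TII->getMachineCombinerPatterns(MI, Patterns))
  //     for (auto P : Patterns) {
  //       TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, ...);
  //       // The alternative is kept only if the scheduling model says it
  //       // shortens the critical path; that depth/latency query is what
  //       // the MachineCombiner.cpp hunk above makes robust for
  //       // instructions that are not yet inserted into a block.
  //     }
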
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -9190,6 +9190,339 @@
   }
 }
 
+static bool hasAllOnesOperand(MachineInstr &MI) {
+  auto Constants =
+      MI.getParent()->getParent()->getConstantPool()->getConstants();
+  for (auto &MO : MI.operands()) {
+    if (MO.isCPI()) {
+      // We have a constant pool index operand in this instruction.
+      // FIXME: should we deal with other kinds of operand, e.g. immediates?
+      auto ConstantEntry = Constants[MO.getIndex()];
+      // FIXME: what should we do with a MachineConstantPoolEntry?
+      if (!ConstantEntry.isMachineConstantPoolEntry()) {
+        if (auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal)) {
+          if (C->getType()->isVectorTy()) {
+            if (!(C = C->getSplatValue()))
+              return false;
+          }
+          if (auto *CFP = dyn_cast<ConstantFP>(C))
+            return CFP->isExactlyValue(1.0);
+        }
+      }
+    }
+  }
+  return false;
+}
+
+static bool isDividendAllOnes(MachineFunction &MF, unsigned DividendReg) {
+  // The dividend could be the constant 1.0, in which case we should not
+  // create an additional constant for the reciprocal division but use the
+  // dividend itself. Find the dividend's defining instruction and check
+  // whether it materializes the constant 1.0.
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      for (auto &MO : MI.operands()) {
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+        if (MO.getReg() == DividendReg)
+          return hasAllOnesOperand(MI);
+      }
+    }
+  }
+  return false;
+}
+
+static EVT getFDivEVT(MachineInstr &Root) {
+  // FIXME: should we support other kinds of FDIV?
+  switch (Root.getOpcode()) {
+  default:
+    break;
+  case X86::DIVSSrr:   // f32
+  case X86::VDIVSSrr:  // f32
+    return MVT::f32;
+  case X86::DIVPSrr:   // v4f32
+  case X86::VDIVPSrr:  // v4f32
+    return MVT::v4f32;
+  case X86::VDIVPSYrr: // v8f32
+    return MVT::v8f32;
+  }
+  return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
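
Note: in genReciprocalDiv below, the Instrs array is indexed by the
numbered steps in the function body. The slot convention, using the
VDIVSSrr opcode set as the example:

  // Instrs[0]: rcp        initial estimate              VRCPSSr
  // Instrs[1]: mul        C * X[i]                      VMULSSrr
  // Instrs[2]: load       constant 1.0 from the pool    VMOVSSrm
  // Instrs[3]: sub        1 - C * X[i]                  VSUBSSrr
  // Instrs[4]: mul2       X[i] * (1 - C * X[i])         VMULSSrr
  // Instrs[5]: add        X[i] + X[i] * (1 - C * X[i])  VADDSSrr
  // Instrs[6]: result mul B * 1/C                       VMULSSrr
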
+/// genReciprocalDiv - Generates A = B * 1/C instead of A = B / C.
+///
+/// To get more precision we use Newton-Raphson iterations:
+///
+///   X[0] = reciprocal(C);
+///   X[i+1] = X[i] + X[i] * (1 - C * X[i]); // each iteration adds precision
+///
+/// and the result of the division is then A = B * X.
+/// Example (-x86-asm-syntax=intel): instead of
+///
+///   vmovss xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero
+///   vdivss xmm0, xmm1, xmm0
+///
+/// we generate
+///
+///   vmovss xmm1, dword ptr [rip + .LCPI0_0] # xmm1 = mem[0],zero,zero,zero
+///   vrcpss xmm2, xmm0, xmm0
+///   vmulss xmm0, xmm0, xmm2
+///   vsubss xmm0, xmm1, xmm0
+///   vmulss xmm0, xmm0, xmm2
+///   vaddss xmm0, xmm0, xmm2
+static void genReciprocalDiv(MachineInstr &Root,
+                             SmallVectorImpl<MachineInstr *> &InsInstrs,
+                             DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                             ArrayRef<unsigned> Instrs, Type *Ty,
+                             X86Subtarget &Subtarget) {
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+  const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+  int Iterations = TLI->getDivRefinementSteps(getFDivEVT(Root), MF);
+
+  unsigned ResultReg = Root.getOperand(0).getReg();
+
+  MachineOperand Dividend = Root.getOperand(1);
+  unsigned DividendReg = Dividend.getReg();
+  bool DividendIsAllOnes = isDividendAllOnes(MF, DividendReg);
+
+  unsigned DividerReg = Root.getOperand(2).getReg();
+  bool DividerIsKill = Root.getOperand(2).isKill();
+
+  if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+    MRI.constrainRegClass(ResultReg, RC);
+  if (TargetRegisterInfo::isVirtualRegister(DividendReg))
+    MRI.constrainRegClass(DividendReg, RC);
+  if (TargetRegisterInfo::isVirtualRegister(DividerReg))
+    MRI.constrainRegClass(DividerReg, RC);
+
+  if (Iterations < 0)
+    Iterations = 1; // An unset refinement count defaults to one iteration.
+
+  // 0: rcp
+  // The initial estimate is the hardware reciprocal of C.
+  MachineInstrBuilder RcpMI;
+  // If DivIsRcp is true then div ~= rcp without any additional refinement.
+  bool DivIsRcp = DividendIsAllOnes && !Iterations;
+
+  unsigned RcpVReg = 0;
+  if (!DivIsRcp) {
+    // We need refinement, and only because of that do we need this vreg.
+    RcpVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(RcpVReg, 0));
+  }
+  if (Instrs[0] == X86::VRCPSSr)
+    RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]),
+                    DivIsRcp ? ResultReg : RcpVReg)
+                .addReg(DividerReg, getKillRegState(DividerIsKill))
+                .addReg(DividerReg, getKillRegState(DividerIsKill));
+  else
+    RcpMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[0]),
+                    DivIsRcp ? ResultReg : RcpVReg)
+                .addReg(DividerReg, getKillRegState(DividerIsKill));
+  InsInstrs.push_back(RcpMI);
+
+  unsigned LoadVReg = 0;
+  if (!DividendIsAllOnes && Iterations) {
+    // 2: load
+    // We need the constant 1.0 to be able to form (1 - C * X[i]).
+    // x86-32 PIC requires a PIC base register for constant pools.
+    unsigned PICBase = 0;
+    if (MF.getTarget().isPositionIndependent()) {
+      if (Subtarget.is64Bit())
+        PICBase = X86::RIP;
+      else
+        // FIXME: PICBase = getGlobalBaseReg(&MF);
+        // This doesn't work for several reasons.
+        // 1. GlobalBaseReg may have been spilled.
+        // 2. It may not be live at MI.
+        return;
+    }
+    // Create a constant-pool entry, aligned for the (possibly packed) type.
+    MachineConstantPool &MCP = *MF.getConstantPool();
+    auto *CFP = ConstantFP::get(Ty, 1.0);
+    unsigned Alignment = Ty->getPrimitiveSizeInBits() / 8;
+    unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
+    LoadVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(LoadVReg, 0));
+
+    MachineInstrBuilder LoadMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[2]), LoadVReg)
+            .addReg(PICBase)
+            .addImm(1)
+            .addReg(0)
+            .addConstantPoolIndex(CPI)
+            .addReg(0);
+    InsInstrs.push_back(LoadMI);
+  }
+
+  unsigned EstVReg = RcpVReg; // X[0] = reciprocal(C);
+
+  for (int i = 0; i < Iterations; i++) {
+    // 1: mul
+    unsigned MulVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(MulVReg, 0));
+    MachineInstrBuilder MulMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[1]), MulVReg)
+            .addReg(DividerReg, getKillRegState(DividerIsKill))
+            .addReg(EstVReg);
+    InsInstrs.push_back(MulMI); // C * X[i]
+
+    // 3: sub
+    unsigned SubVReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(SubVReg, 0));
+    MachineInstrBuilder SubMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[3]), SubVReg)
+            .addReg(DividendIsAllOnes ? DividendReg : LoadVReg)
+            .addReg(MulVReg);
+    InsInstrs.push_back(SubMI); // 1 - C * X[i]
+
+    // 4: mul2
+    unsigned Mul2VReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(Mul2VReg, 0));
+    MachineInstrBuilder Mul2MI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[4]), Mul2VReg)
+            .addReg(SubVReg)
+            .addReg(EstVReg);
+    InsInstrs.push_back(Mul2MI); // X[i] * (1 - C * X[i])
+
+    // 5: add
+    MachineInstrBuilder AddMI;
+    if (DividendIsAllOnes && (i + 1 == Iterations)) {
+      AddMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]), ResultReg)
+                  .addReg(Mul2VReg)
+                  .addReg(EstVReg);
+    } else {
+      unsigned AddVReg = MRI.createVirtualRegister(RC);
+      InstrIdxForVirtReg.insert(std::make_pair(AddVReg, 0));
+      AddMI = BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[5]), AddVReg)
+                  .addReg(Mul2VReg)
+                  .addReg(EstVReg);
+      EstVReg = AddVReg;
+    }
+    InsInstrs.push_back(AddMI); // X[i] + X[i] * (1 - C * X[i])
+  }
+  if (!DividendIsAllOnes) {
+    // 6: result mul
+    // The final multiplication B * 1/C.
+    MachineInstrBuilder ResultMulMI =
+        BuildMI(MF, Root.getDebugLoc(), TII->get(Instrs[6]), ResultReg)
+            .addReg(DividendReg)
+            .addReg(EstVReg);
+    InsInstrs.push_back(ResultMulMI);
+  }
+}
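
Note: when the dividend is the constant 1.0 and zero refinement steps are
requested, DivIsRcp is true and the whole division collapses to the bare
estimate instruction writing ResultReg directly, e.g. for f32 under AVX
just:

  vrcpss xmm1, xmm0, xmm0

With refinement enabled, a 1.0 dividend is still exploited: it doubles as
the subtract's constant operand, so no extra constant-pool load is emitted,
and the final B * 1/C multiply is skipped.
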
+/// When getMachineCombinerPatterns() finds potential patterns, this function
+/// generates the instructions that could replace the original code sequence.
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineFunction &MF = *MBB.getParent();
+
+  switch (Pattern) {
+  default:
+    // Reassociate instructions.
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    return;
+  case MachineCombinerPattern::Div2RecipEst:
+    switch (Root.getOpcode()) {
+    default:
+      return;
+    case X86::VDIVSSrr: // f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::VRCPSSr, X86::VMULSSrr, X86::VMOVSSrm, X86::VSUBSSrr,
+           X86::VMULSSrr, X86::VADDSSrr, X86::VMULSSrr},
+          Type::getFloatTy(MF.getFunction()->getContext()), Subtarget);
+      break;
+    case X86::VDIVPSrr: // v4f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::VRCPPSr, X86::VMULPSrr, X86::VMOVAPSrm, X86::VSUBPSrr,
+           X86::VMULPSrr, X86::VADDPSrr, X86::VMULPSrr},
+          VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4),
+          Subtarget);
+      break;
+    case X86::VDIVPSYrr: // v8f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::VRCPPSYr, X86::VMULPSYrr, X86::VMOVAPSYrm, X86::VSUBPSYrr,
+           X86::VMULPSYrr, X86::VADDPSYrr, X86::VMULPSYrr},
+          VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8),
+          Subtarget);
+      break;
+    case X86::DIVSSrr: // f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::RCPSSr, X86::MULSSrr, X86::MOVSSrm, X86::SUBSSrr,
+           X86::MULSSrr, X86::ADDSSrr, X86::MULSSrr},
+          Type::getFloatTy(MF.getFunction()->getContext()), Subtarget);
+      break;
+    case X86::DIVPSrr: // v4f32
+      genReciprocalDiv(
+          Root, InsInstrs, InstrIdxForVirtReg,
+          {X86::RCPPSr, X86::MULPSrr, X86::MOVAPSrm, X86::SUBPSrr,
+           X86::MULPSrr, X86::ADDPSrr, X86::MULPSrr},
+          VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 4),
+          Subtarget);
+      break;
+    }
+    break;
+  }
+  DEBUG(dbgs() << "\nAlternate sequence for " << MF.getName() << "\n";
+        for (unsigned i = 0; i < InsInstrs.size(); i++) {
+          dbgs() << i << ": ";
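+          InsInstrs[i]->print(dbgs(), false, Root.getParent());
+        });
+  DelInstrs.push_back(&Root); // Record the FDiv for deletion.
+}

Note: the rcp step in genReciprocalDiv special-cases X86::VRCPSSr because
the AVX scalar form is a three-operand instruction whose destination takes
its upper bits from the first source, so it needs two source operands,
while the SSE scalar and the packed forms take a single source. A sketch of
the two shapes being built (hypothetical Dst/Src names):

  // AVX scalar: merge upper bits from Src, estimate the low element.
  BuildMI(MF, DL, TII->get(X86::VRCPSSr), Dst).addReg(Src).addReg(Src);
  // SSE scalar and packed forms: a single source register.
  BuildMI(MF, DL, TII->get(X86::RCPSSr), Dst).addReg(Src);
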
+/// Find FDIV instructions that can be replaced with a reciprocal estimate.
+static bool
+getFDIVPatterns(MachineInstr &Root,
+                SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+  auto *MF = Root.getParent()->getParent();
+  auto *TLI = MF->getSubtarget().getTargetLowering();
+  EVT VT = getFDivEVT(Root);
+  if (VT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+    return false;
+  int Enabled = TLI->getRecipEstimateDivEnabled(VT, *MF);
+  if (Enabled == TLI->ReciprocalEstimate::Disabled)
+    return false;
+  Patterns.push_back(MachineCombinerPattern::Div2RecipEst);
+  return true;
+}
+
+/// Return true when there is potentially a faster code sequence for an
+/// instruction chain ending in \p Root. All potential patterns are listed in
+/// the \p Patterns vector. Patterns should be sorted in priority order since
+/// the pattern evaluator stops checking as soon as it finds a faster
+/// sequence.
+bool X86InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root,
+    SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // FDIV patterns.
+  if (getFDIVPatterns(Root, Patterns))
+    return true;
+  // TODO: FSQRT patterns will be added once the reciprocal implementation
+  // is complete.
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
+}
+
 /// This is an architecture-specific helper function of reassociateOps.
 /// Set special operand attributes for new instructions after reassociation.
 void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
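
Note: to exercise the new path, reciprocal estimates must be enabled for
the function; the combiner keys off the same TargetLowering hooks the DAG
code used. A sketch at the IR level (attribute spelling assumed from the
existing -mrecip handling in TargetLoweringBase; "divf:2" would request
scalar f32 division estimates with two refinement steps):

  define float @recip(float %x) #0 {
    %r = fdiv float 1.000000e+00, %x
    ret float %r
  }
  attributes #0 = { "reciprocal-estimates"="divf:2" }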