Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -7964,326 +7964,6 @@ return false; } -static void addOperands(MachineInstrBuilder &MIB, ArrayRef MOs, - int PtrOffset = 0) { - unsigned NumAddrOps = MOs.size(); - - if (NumAddrOps < 4) { - // FrameIndex only - add an immediate offset (whether its zero or not). - for (unsigned i = 0; i != NumAddrOps; ++i) - MIB.add(MOs[i]); - addOffset(MIB, PtrOffset); - } else { - // General Memory Addressing - we need to add any offset to an existing - // offset. - assert(MOs.size() == 5 && "Unexpected memory operand list length"); - for (unsigned i = 0; i != NumAddrOps; ++i) { - const MachineOperand &MO = MOs[i]; - if (i == 3 && PtrOffset != 0) { - MIB.addDisp(MO, PtrOffset); - } else { - MIB.add(MO); - } - } - } -} - -static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, - ArrayRef MOs, - MachineBasicBlock::iterator InsertPt, - MachineInstr &MI, - const TargetInstrInfo &TII) { - // Create the base instruction with the memory operand as the first part. - // Omit the implicit operands, something BuildMI can't do. - MachineInstr *NewMI = - MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true); - MachineInstrBuilder MIB(MF, NewMI); - addOperands(MIB, MOs); - - // Loop over the rest of the ri operands, converting them over. - unsigned NumOps = MI.getDesc().getNumOperands() - 2; - for (unsigned i = 0; i != NumOps; ++i) { - MachineOperand &MO = MI.getOperand(i + 2); - MIB.add(MO); - } - for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - MIB.add(MO); - } - - MachineBasicBlock *MBB = InsertPt->getParent(); - MBB->insert(InsertPt, NewMI); - - return MIB; -} - -static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, - unsigned OpNo, ArrayRef MOs, - MachineBasicBlock::iterator InsertPt, - MachineInstr &MI, const TargetInstrInfo &TII, - int PtrOffset = 0) { - // Omit the implicit operands, something BuildMI can't do. - MachineInstr *NewMI = - MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true); - MachineInstrBuilder MIB(MF, NewMI); - - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (i == OpNo) { - assert(MO.isReg() && "Expected to fold into reg operand!"); - addOperands(MIB, MOs, PtrOffset); - } else { - MIB.add(MO); - } - } - - MachineBasicBlock *MBB = InsertPt->getParent(); - MBB->insert(InsertPt, NewMI); - - return MIB; -} - -static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, - ArrayRef MOs, - MachineBasicBlock::iterator InsertPt, - MachineInstr &MI) { - MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt, - MI.getDebugLoc(), TII.get(Opcode)); - addOperands(MIB, MOs); - return MIB.addImm(0); -} - -MachineInstr *X86InstrInfo::foldMemoryOperandCustom( - MachineFunction &MF, MachineInstr &MI, unsigned OpNum, - ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align) const { - switch (MI.getOpcode()) { - case X86::INSERTPSrr: - case X86::VINSERTPSrr: - case X86::VINSERTPSZrr: - // Attempt to convert the load of inserted vector into a fold load - // of a single float. 
- if (OpNum == 2) { - unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); - unsigned ZMask = Imm & 15; - unsigned DstIdx = (Imm >> 4) & 3; - unsigned SrcIdx = (Imm >> 6) & 3; - - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); - unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if (Size <= RCSize && 4 <= Align) { - int PtrOffset = SrcIdx * 4; - unsigned NewImm = (DstIdx << 4) | ZMask; - unsigned NewOpCode = - (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm : - (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm : - X86::INSERTPSrm; - MachineInstr *NewMI = - FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); - NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); - return NewMI; - } - } - break; - case X86::MOVHLPSrr: - case X86::VMOVHLPSrr: - case X86::VMOVHLPSZrr: - // Move the upper 64-bits of the second operand to the lower 64-bits. - // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. - // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. - if (OpNum == 2) { - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); - unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if (Size <= RCSize && 8 <= Align) { - unsigned NewOpCode = - (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : - (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm : - X86::MOVLPSrm; - MachineInstr *NewMI = - FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8); - return NewMI; - } - } - break; - }; - - return nullptr; -} - -MachineInstr *X86InstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr &MI, unsigned OpNum, - ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align, bool AllowCommute) const { - const DenseMap > *OpcodeTablePtr = nullptr; - bool isSlowTwoMemOps = Subtarget.slowTwoMemOps(); - bool isTwoAddrFold = false; - - // For CPUs that favor the register form of a call or push, - // do not fold loads into calls or pushes, unless optimizing for size - // aggressively. - if (isSlowTwoMemOps && !MF.getFunction()->optForMinSize() && - (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r || - MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r || - MI.getOpcode() == X86::PUSH64r)) - return nullptr; - - unsigned NumOps = MI.getDesc().getNumOperands(); - bool isTwoAddr = - NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; - - // FIXME: AsmPrinter doesn't know how to handle - // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. - if (MI.getOpcode() == X86::ADD32ri && - MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) - return nullptr; - - MachineInstr *NewMI = nullptr; - - // Attempt to fold any custom cases we have. - if (MachineInstr *CustomMI = - foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) - return CustomMI; - - // Folding a memory location into the two-address part of a two-address - // instruction is different than folding it other places. It requires - // replacing the *two* registers with the memory location. 
- if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() && - MI.getOperand(1).isReg() && - MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) { - OpcodeTablePtr = &RegOp2MemOpTable2Addr; - isTwoAddrFold = true; - } else if (OpNum == 0) { - if (MI.getOpcode() == X86::MOV32r0) { - NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI); - if (NewMI) - return NewMI; - } - - OpcodeTablePtr = &RegOp2MemOpTable0; - } else if (OpNum == 1) { - OpcodeTablePtr = &RegOp2MemOpTable1; - } else if (OpNum == 2) { - OpcodeTablePtr = &RegOp2MemOpTable2; - } else if (OpNum == 3) { - OpcodeTablePtr = &RegOp2MemOpTable3; - } else if (OpNum == 4) { - OpcodeTablePtr = &RegOp2MemOpTable4; - } - - // If table selected... - if (OpcodeTablePtr) { - // Find the Opcode to fuse - auto I = OpcodeTablePtr->find(MI.getOpcode()); - if (I != OpcodeTablePtr->end()) { - unsigned Opcode = I->second.first; - unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; - if (Align < MinAlign) - return nullptr; - bool NarrowToMOV32rm = false; - if (Size) { - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, - &RI, MF); - unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if (Size < RCSize) { - // Check if it's safe to fold the load. If the size of the object is - // narrower than the load width, then it's not. - if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) - return nullptr; - // If this is a 64-bit load, but the spill slot is 32, then we can do - // a 32-bit load which is implicitly zero-extended. This likely is - // due to live interval analysis remat'ing a load from stack slot. - if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) - return nullptr; - Opcode = X86::MOV32rm; - NarrowToMOV32rm = true; - } - } - - if (isTwoAddrFold) - NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this); - else - NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this); - - if (NarrowToMOV32rm) { - // If this is the special case where we use a MOV32rm to load a 32-bit - // value and zero-extend the top bits. Change the destination register - // to a 32-bit one. - unsigned DstReg = NewMI->getOperand(0).getReg(); - if (TargetRegisterInfo::isPhysicalRegister(DstReg)) - NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); - else - NewMI->getOperand(0).setSubReg(X86::sub_32bit); - } - return NewMI; - } - } - - // If the instruction and target operand are commutable, commute the - // instruction and try again. - if (AllowCommute) { - unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; - if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { - bool HasDef = MI.getDesc().getNumDefs(); - unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0; - unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg(); - unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg(); - bool Tied1 = - 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); - bool Tied2 = - 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); - - // If either of the commutable operands are tied to the destination - // then we can not commute + fold. - if ((HasDef && Reg0 == Reg1 && Tied1) || - (HasDef && Reg0 == Reg2 && Tied2)) - return nullptr; - - MachineInstr *CommutedMI = - commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); - if (!CommutedMI) { - // Unable to commute. - return nullptr; - } - if (CommutedMI != &MI) { - // New instruction. 
We can't fold from this. - CommutedMI->eraseFromParent(); - return nullptr; - } - - // Attempt to fold with the commuted version of the instruction. - NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, - Size, Align, /*AllowCommute=*/false); - if (NewMI) - return NewMI; - - // Folding failed again - undo the commute before returning. - MachineInstr *UncommutedMI = - commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); - if (!UncommutedMI) { - // Unable to commute. - return nullptr; - } - if (UncommutedMI != &MI) { - // New instruction. It doesn't need to be kept. - UncommutedMI->eraseFromParent(); - return nullptr; - } - - // Return here to prevent duplicate fuse failure report. - return nullptr; - } - } - - // No fusion - if (PrintFailedFusing && !MI.isCopy()) - dbgs() << "We failed to fuse operand " << OpNum << " in " << MI; - return nullptr; -} - /// Return true for all instructions that only update /// the first 32 or 64-bits of the destination register and leave the rest /// unmodified. This can be used to avoid folding loads if the instructions @@ -8494,64 +8174,388 @@ return true; } - return false; + return false; +} + +/// Inform the ExecutionDepsFix pass how many idle instructions we would like +/// before certain undef register reads. +/// +/// This catches the VCVTSI2SD family of instructions: +/// +/// vcvtsi2sdq %rax, %xmm0, %xmm14 +/// +/// We should to be careful *not* to catch VXOR idioms which are presumably +/// handled specially in the pipeline: +/// +/// vxorps %xmm1, %xmm1, %xmm1 +/// +/// Like getPartialRegUpdateClearance, this makes a strong assumption that the +/// high bits that are passed-through are not live. +unsigned +X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, + const TargetRegisterInfo *TRI) const { + if (!hasUndefRegUpdate(MI.getOpcode())) + return 0; + + // Set the OpNum parameter to the first source operand. + OpNum = 1; + + const MachineOperand &MO = MI.getOperand(OpNum); + if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + return UndefRegClearance; + } + return 0; +} + +void X86InstrInfo::breakPartialRegDependency( + MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { + unsigned Reg = MI.getOperand(OpNum).getReg(); + // If MI kills this register, the false dependence is already broken. + if (MI.killsRegister(Reg, TRI)) + return; + + if (X86::VR128RegClass.contains(Reg)) { + // These instructions are all floating point domain, so xorps is the best + // choice. + unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR256RegClass.contains(Reg)) { + // Use vxorps to clear the full ymm register. + // It wants to read and write the xmm sub-register. + unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) + .addReg(XReg, RegState::Undef) + .addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + MI.addRegisterKilled(Reg, TRI, true); + } +} + +static void addOperands(MachineInstrBuilder &MIB, ArrayRef MOs, + int PtrOffset = 0) { + unsigned NumAddrOps = MOs.size(); + + if (NumAddrOps < 4) { + // FrameIndex only - add an immediate offset (whether its zero or not). 
+ for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.add(MOs[i]); + addOffset(MIB, PtrOffset); + } else { + // General Memory Addressing - we need to add any offset to an existing + // offset. + assert(MOs.size() == 5 && "Unexpected memory operand list length"); + for (unsigned i = 0; i != NumAddrOps; ++i) { + const MachineOperand &MO = MOs[i]; + if (i == 3 && PtrOffset != 0) { + MIB.addDisp(MO, PtrOffset); + } else { + MIB.add(MO); + } + } + } +} + +static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, + ArrayRef MOs, + MachineBasicBlock::iterator InsertPt, + MachineInstr &MI, + const TargetInstrInfo &TII) { + // Create the base instruction with the memory operand as the first part. + // Omit the implicit operands, something BuildMI can't do. + MachineInstr *NewMI = + MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true); + MachineInstrBuilder MIB(MF, NewMI); + addOperands(MIB, MOs); + + // Loop over the rest of the ri operands, converting them over. + unsigned NumOps = MI.getDesc().getNumOperands() - 2; + for (unsigned i = 0; i != NumOps; ++i) { + MachineOperand &MO = MI.getOperand(i + 2); + MIB.add(MO); + } + for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); + MIB.add(MO); + } + + MachineBasicBlock *MBB = InsertPt->getParent(); + MBB->insert(InsertPt, NewMI); + + return MIB; +} + +static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, + unsigned OpNo, ArrayRef MOs, + MachineBasicBlock::iterator InsertPt, + MachineInstr &MI, const TargetInstrInfo &TII, + int PtrOffset = 0) { + // Omit the implicit operands, something BuildMI can't do. + MachineInstr *NewMI = + MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true); + MachineInstrBuilder MIB(MF, NewMI); + + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); + if (i == OpNo) { + assert(MO.isReg() && "Expected to fold into reg operand!"); + addOperands(MIB, MOs, PtrOffset); + } else { + MIB.add(MO); + } + } + + MachineBasicBlock *MBB = InsertPt->getParent(); + MBB->insert(InsertPt, NewMI); + + return MIB; +} + +static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, + ArrayRef MOs, + MachineBasicBlock::iterator InsertPt, + MachineInstr &MI) { + MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt, + MI.getDebugLoc(), TII.get(Opcode)); + addOperands(MIB, MOs); + return MIB.addImm(0); +} + +MachineInstr *X86InstrInfo::foldMemoryOperandCustom( + MachineFunction &MF, MachineInstr &MI, unsigned OpNum, + ArrayRef MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const { + switch (MI.getOpcode()) { + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + case X86::VINSERTPSZrr: + // Attempt to convert the load of inserted vector into a fold load + // of a single float. + if (OpNum == 2) { + unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; + if (Size <= RCSize && 4 <= Align) { + int PtrOffset = SrcIdx * 4; + unsigned NewImm = (DstIdx << 4) | ZMask; + unsigned NewOpCode = + (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm : + (MI.getOpcode() == X86::VINSERTPSrr) ? 
X86::VINSERTPSrm : + X86::INSERTPSrm; + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); + NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); + return NewMI; + } + } + break; + case X86::MOVHLPSrr: + case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: + // Move the upper 64-bits of the second operand to the lower 64-bits. + // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. + // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. + if (OpNum == 2) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; + if (Size <= RCSize && 8 <= Align) { + unsigned NewOpCode = + (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : + (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm : + X86::MOVLPSrm; + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8); + return NewMI; + } + } + break; + }; + + return nullptr; } -/// Inform the ExecutionDepsFix pass how many idle instructions we would like -/// before certain undef register reads. -/// -/// This catches the VCVTSI2SD family of instructions: -/// -/// vcvtsi2sdq %rax, %xmm0, %xmm14 -/// -/// We should to be careful *not* to catch VXOR idioms which are presumably -/// handled specially in the pipeline: -/// -/// vxorps %xmm1, %xmm1, %xmm1 -/// -/// Like getPartialRegUpdateClearance, this makes a strong assumption that the -/// high bits that are passed-through are not live. -unsigned -X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, - const TargetRegisterInfo *TRI) const { - if (!hasUndefRegUpdate(MI.getOpcode())) - return 0; +MachineInstr *X86InstrInfo::foldMemoryOperandImpl( + MachineFunction &MF, MachineInstr &MI, unsigned OpNum, + ArrayRef MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align, bool AllowCommute) const { + const DenseMap > *OpcodeTablePtr = nullptr; + bool isSlowTwoMemOps = Subtarget.slowTwoMemOps(); + bool isTwoAddrFold = false; - // Set the OpNum parameter to the first source operand. - OpNum = 1; + // For CPUs that favor the register form of a call or push, + // do not fold loads into calls or pushes, unless optimizing for size + // aggressively. + if (isSlowTwoMemOps && !MF.getFunction()->optForMinSize() && + (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r || + MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r || + MI.getOpcode() == X86::PUSH64r)) + return nullptr; - const MachineOperand &MO = MI.getOperand(OpNum); - if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { - return UndefRegClearance; + // Avoid partial register update stalls unless optimizing for size. + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode())) + return nullptr; + + unsigned NumOps = MI.getDesc().getNumOperands(); + bool isTwoAddr = + NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI.getOpcode() == X86::ADD32ri && + MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return nullptr; + + MachineInstr *NewMI = nullptr; + + // Attempt to fold any custom cases we have. 
+ if (MachineInstr *CustomMI = + foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + return CustomMI; + + // Folding a memory location into the two-address part of a two-address + // instruction is different than folding it other places. It requires + // replacing the *two* registers with the memory location. + if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() && + MI.getOperand(1).isReg() && + MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) { + OpcodeTablePtr = &RegOp2MemOpTable2Addr; + isTwoAddrFold = true; + } else if (OpNum == 0) { + if (MI.getOpcode() == X86::MOV32r0) { + NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI); + if (NewMI) + return NewMI; + } + + OpcodeTablePtr = &RegOp2MemOpTable0; + } else if (OpNum == 1) { + OpcodeTablePtr = &RegOp2MemOpTable1; + } else if (OpNum == 2) { + OpcodeTablePtr = &RegOp2MemOpTable2; + } else if (OpNum == 3) { + OpcodeTablePtr = &RegOp2MemOpTable3; + } else if (OpNum == 4) { + OpcodeTablePtr = &RegOp2MemOpTable4; } - return 0; -} -void X86InstrInfo::breakPartialRegDependency( - MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { - unsigned Reg = MI.getOperand(OpNum).getReg(); - // If MI kills this register, the false dependence is already broken. - if (MI.killsRegister(Reg, TRI)) - return; + // If table selected... + if (OpcodeTablePtr) { + // Find the Opcode to fuse + auto I = OpcodeTablePtr->find(MI.getOpcode()); + if (I != OpcodeTablePtr->end()) { + unsigned Opcode = I->second.first; + unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; + if (Align < MinAlign) + return nullptr; + bool NarrowToMOV32rm = false; + if (Size) { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, + &RI, MF); + unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; + if (Size < RCSize) { + // Check if it's safe to fold the load. If the size of the object is + // narrower than the load width, then it's not. + if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) + return nullptr; + // If this is a 64-bit load, but the spill slot is 32, then we can do + // a 32-bit load which is implicitly zero-extended. This likely is + // due to live interval analysis remat'ing a load from stack slot. + if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) + return nullptr; + Opcode = X86::MOV32rm; + NarrowToMOV32rm = true; + } + } - if (X86::VR128RegClass.contains(Reg)) { - // These instructions are all floating point domain, so xorps is the best - // choice. - unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg) - .addReg(Reg, RegState::Undef) - .addReg(Reg, RegState::Undef); - MI.addRegisterKilled(Reg, TRI, true); - } else if (X86::VR256RegClass.contains(Reg)) { - // Use vxorps to clear the full ymm register. - // It wants to read and write the xmm sub-register. 
- unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) - .addReg(XReg, RegState::Undef) - .addReg(XReg, RegState::Undef) - .addReg(Reg, RegState::ImplicitDefine); - MI.addRegisterKilled(Reg, TRI, true); + if (isTwoAddrFold) + NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this); + else + NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this); + + if (NarrowToMOV32rm) { + // If this is the special case where we use a MOV32rm to load a 32-bit + // value and zero-extend the top bits. Change the destination register + // to a 32-bit one. + unsigned DstReg = NewMI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); + else + NewMI->getOperand(0).setSubReg(X86::sub_32bit); + } + return NewMI; + } + } + + // If the instruction and target operand are commutable, commute the + // instruction and try again. + if (AllowCommute) { + unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; + if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { + bool HasDef = MI.getDesc().getNumDefs(); + unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0; + unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg(); + unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg(); + bool Tied1 = + 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied2 = + 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); + + // If either of the commutable operands are tied to the destination + // then we can not commute + fold. + if ((HasDef && Reg0 == Reg1 && Tied1) || + (HasDef && Reg0 == Reg2 && Tied2)) + return nullptr; + + MachineInstr *CommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != &MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } + + // Attempt to fold with the commuted version of the instruction. + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, + Size, Align, /*AllowCommute=*/false); + if (NewMI) + return NewMI; + + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!UncommutedMI) { + // Unable to commute. + return nullptr; + } + if (UncommutedMI != &MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. + return nullptr; + } } + + // No fusion + if (PrintFailedFusing && !MI.isCopy()) + dbgs() << "We failed to fuse operand " << OpNum << " in " << MI; + return nullptr; } MachineInstr * @@ -8563,11 +8567,6 @@ if (NoFusing) return nullptr; - // Unless optimizing for size, don't fold to avoid partial - // register update stalls - if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode())) - return nullptr; - // Don't fold subreg spills, or reloads that use a high subreg. for (auto Op : Ops) { MachineOperand &MO = MI.getOperand(Op); @@ -8762,10 +8761,6 @@ // Check switch flag if (NoFusing) return nullptr; - // Avoid partial register update stalls unless optimizing for size. - if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode())) - return nullptr; - // Determine the alignment of the load. 
unsigned Alignment = 0; if (LoadMI.hasOneMemOperand()) Index: test/CodeGen/X86/fast-isel-fptrunc-fpext.ll =================================================================== --- test/CodeGen/X86/fast-isel-fptrunc-fpext.ll +++ test/CodeGen/X86/fast-isel-fptrunc-fpext.ll @@ -54,7 +54,8 @@ define double @single_to_double_rm(float* %x) { ; SSE-LABEL: single_to_double_rm: ; SSE: # BB#0: # %entry -; SSE-NEXT: cvtss2sd (%rdi), %xmm0 +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: cvtss2sd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: single_to_double_rm: @@ -71,7 +72,8 @@ define float @double_to_single_rm(double* %x) { ; SSE-LABEL: double_to_single_rm: ; SSE: # BB#0: # %entry -; SSE-NEXT: cvtsd2ss (%rdi), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: cvtsd2ss %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: double_to_single_rm: Index: test/CodeGen/X86/fast-isel-int-float-conversion.ll =================================================================== --- test/CodeGen/X86/fast-isel-int-float-conversion.ll +++ test/CodeGen/X86/fast-isel-int-float-conversion.ll @@ -21,7 +21,8 @@ define double @int_to_double_rm(i32* %a) { ; SSE2-LABEL: int_to_double_rm: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: cvtsi2sdl (%rdi), %xmm0 +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: cvtsi2sdl %eax, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: int_to_double_rm: @@ -52,7 +53,8 @@ define float @int_to_float_rm(i32* %a) { ; SSE2-LABEL: int_to_float_rm: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: cvtsi2ssl (%rdi), %xmm0 +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: cvtsi2ssl %eax, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: int_to_float_rm:
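
Note on the functional change (as opposed to the code motion above): the "Avoid partial register update stalls unless optimizing for size" guard now lives inside foldMemoryOperandImpl itself rather than in the two public foldMemoryOperand entry points, so every path that reaches the shared implementation is covered. The updated fast-isel tests show the visible effect: cvtss2sd, cvtsd2ss, and cvtsi2sd no longer fold their memory operand and instead keep a separate load plus the register form. The sketch below is a deliberately simplified, hypothetical model of that decision; the enum, helper names, and values are invented for illustration and are not LLVM's actual API.

    #include <cstdio>

    // Hypothetical stand-ins for the real opcode enum and query. cvtsi2sd and
    // friends only write the low part of their xmm destination, so folding a
    // load into them keeps a false dependency on the register's old contents.
    enum class Opc { CVTSI2SDrr, MOV32rm };

    static bool hasPartialRegUpdate(Opc O) {
      return O == Opc::CVTSI2SDrr;
    }

    // Models the guard this patch hoists into foldMemoryOperandImpl: refuse to
    // fold a load into a partial-update instruction unless optimizing for size.
    static bool tryFoldLoad(Opc O, bool OptForSize) {
      if (!OptForSize && hasPartialRegUpdate(O))
        return false; // keep the load separate; the dependency can be broken later
      return true;
    }

    int main() {
      std::printf("fold when optimizing for speed: %d\n",
                  tryFoldLoad(Opc::CVTSI2SDrr, /*OptForSize=*/false)); // prints 0
      std::printf("fold when optimizing for size:  %d\n",
                  tryFoldLoad(Opc::CVTSI2SDrr, /*OptForSize=*/true));  // prints 1
      return 0;
    }

This matches the new SSE expectations in the tests above, e.g. int_to_double_rm now checks for "movl (%rdi), %eax" followed by "cvtsi2sdl %eax, %xmm0" instead of the folded "cvtsi2sdl (%rdi), %xmm0".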