Index: llvm/include/llvm/Target/TargetInstrInfo.h =================================================================== --- llvm/include/llvm/Target/TargetInstrInfo.h +++ llvm/include/llvm/Target/TargetInstrInfo.h @@ -94,6 +94,20 @@ return false; } + /// Assigns the (CommutableOpIdx1, CommutableOpIdx2) pair of commutable + /// operand indices to (ResultIdx1, ResultIdx2). + /// One or both input values of the pair: (ResultIdx1, ResultIdx2) may be + /// predefined to some indices or be undefined (designated by ~0U value). + /// The predefined result indices cannot be re-defined. + /// The function returns true iff after the result pair redefinition + /// the fixed result pair is equal to or equivalent to the source pair of + /// indices: (CommutableOpIdx1, CommutableOpIdx2). It is assumed here that + /// the pairs (x,y) and (y,x) are equivalent. + virtual bool fixCommutedOpIndices(unsigned &ResultIdx1, + unsigned &ResultIdx2, + unsigned CommutableOpIdx1, + unsigned CommutableOpIdx2) const; + private: /// For instructions with opcodes for which the M_REMATERIALIZABLE flag is /// set and the target hook isReallyTriviallyReMaterializable returns false, @@ -255,18 +269,48 @@ /// commute them, this method can overloaded to do that. /// The default implementation simply swaps the commutable operands. /// If NewMI is false, MI is modified in place and returned; otherwise, a - /// new machine instruction is created and returned. Do not call this - /// method for a non-commutable instruction, but there may be some cases - /// where this method fails and returns null. + /// new machine instruction is created and returned. + /// + /// The overloaded version of the method with the indices of the + /// commuted operands may be used when the commuted instruction has + /// more than two operands and thus, there may be preferences in what + /// operand must be commuted. + /// + /// Do not call these methods for a non-commutable instruction. 
+ /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI = false) const; - - /// If specified MI is commutable, return the two operand indices that would - /// swap value. Return false if the instruction - /// is not in a form which this routine understands. + virtual MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI, + unsigned Idx1, + unsigned Idx2) const; + + /// Returns true iff the routine could find two commutable operands in the + /// given machine instruction. + /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their + /// input values can be re-defined in this method only if the input values + /// are not pre-defined, which is designated by the special value ~0U + /// assigned to it. + /// If both of indices are pre-defined and refer to some operands, then the + /// method simply returns true if the corresponding operands are commutable + /// and returns false otherwise. + /// + /// For example, calling this method this way: + /// unsigned Op1 = 1, Op2 = ~0U; + /// findCommutedOpIndices(MI, Op1, Op2); + /// can be interpreted as a query asking to find an operand that would be + /// commutable with the operand#1. + /// virtual bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const; + /// Returns true if the specified MI is commutable and the operands with + /// indices SrcOpIdx1 and SrcOpIdx2 can swap their values. + /// Otherwise, returns false. + virtual bool areOpsCommutable(MachineInstr *MI, unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const; + /// A pair composed of a register and a sub-register index. /// Used to give some type checking when modeling Reg:SubReg. 
struct RegSubRegPair { Index: llvm/lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- llvm/lib/CodeGen/RegisterCoalescer.cpp +++ llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -679,14 +679,19 @@ unsigned UseOpIdx; if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx)) return false; - unsigned Op1, Op2, NewDstIdx; - if (!TII->findCommutedOpIndices(DefMI, Op1, Op2)) - return false; - if (Op1 == UseOpIdx) - NewDstIdx = Op2; - else if (Op2 == UseOpIdx) - NewDstIdx = Op1; - else + + // + // FIXME: The code below tries to commute 'UseOpIdx' operand with some other + // commutable operand which is expressed by ~0U value passed to the method. + // That _other_ operand is chosen by the findCommutedOpIndices() method. + // + // That is obviously an area for improvement in case of instructions having + // more than 2 operands. For example, if some instruction has 3 commutable + // operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3, + // op#2<->op#3) of commute transformation should be considered/tried here. + // + unsigned NewDstIdx = ~0U; + if (!TII->findCommutedOpIndices(DefMI, UseOpIdx, NewDstIdx)) return false; MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx); @@ -719,7 +724,8 @@ // At this point we have decided that it is legal to do this // transformation. Start by commuting the instruction. 
MachineBasicBlock *MBB = DefMI->getParent(); - MachineInstr *NewMI = TII->commuteInstruction(DefMI); + MachineInstr *NewMI = TII->commuteInstruction(DefMI, false, + UseOpIdx, NewDstIdx); if (!NewMI) return false; if (TargetRegisterInfo::isVirtualRegister(IntA.reg) && Index: llvm/lib/CodeGen/TargetInstrInfo.cpp =================================================================== --- llvm/lib/CodeGen/TargetInstrInfo.cpp +++ llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -118,23 +118,37 @@ MBB->addSuccessor(NewDest); } -// commuteInstruction - The default implementation of this method just exchanges -// the two operands returned by findCommutedOpIndices. +/// commuteInstruction - If a target has any instructions that are +/// commutable but require converting to different instructions or making +/// non-trivial changes to commute them, these methods can be overloaded to +/// do that. The default implementations simply swap the commutable +/// operands. +/// +/// If NewMI is false, MI is modified in place and returned; otherwise, a +/// new machine instruction is created and returned. +/// +/// The passed operand indices are used to tell what operands must +/// be commuted. +/// +/// Do not call this method for a non-commutable instruction. +/// Even though the instruction is commutable, the method may still +/// fail to commute the operands, null pointer is returned in such cases. +/// MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { + bool NewMI, + unsigned Idx1, + unsigned Idx2) const { const MCInstrDesc &MCID = MI->getDesc(); bool HasDef = MCID.getNumDefs(); if (HasDef && !MI->getOperand(0).isReg()) // No idea how to commute this instruction. Target should implement its own. 
return nullptr; - unsigned Idx1, Idx2; - if (!findCommutedOpIndices(MI, Idx1, Idx2)) { - assert(MI->isCommutable() && "Precondition violation: MI must be commutable."); - return nullptr; - } + assert(areOpsCommutable(MI, Idx1, Idx2) && + "TargetInstrInfo::CommuteInstruction(): not commutable operands."); assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() && "This only knows how to commute register operands so far"); + unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(Idx1).getReg(); unsigned Reg2 = MI->getOperand(Idx2).getReg(); @@ -184,9 +198,83 @@ return MI; } -/// findCommutedOpIndices - If specified MI is commutable, return the two -/// operand indices that would swap value. Return true if the instruction -/// is not in a form which this routine understands. +/// The default implementation of this method just exchanges the two operands. +/// This method is used when the caller does not care about what operands +/// should be commuted or when there is only one way of doing operands commute +/// transformation, for example, when the commuted instruction has only +/// 2 operands. +/// +MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI, + bool NewMI) const { + unsigned OpIdx1 = ~0U, OpIdx2 = ~0U; + + if (!findCommutedOpIndices(MI, OpIdx1, OpIdx2)) { + assert(MI->isCommutable() && + "Precondition violation: MI must be commutable."); + return nullptr; + } + return commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); +} + +/// Assigns the (CommutableOpIdx1, CommutableOpIdx2) pair of commutable +/// operand indices to (ResultIdx1, ResultIdx2). +/// One or both input values of the pair: (ResultIdx1, ResultIdx2) may be +/// predefined to some indices or be undefined (designated by ~0U value). +/// The predefined result indices cannot be re-defined. 
+/// The function returns true iff after the result pair redefinition +/// the fixed result pair is equal to or equivalent to the source pair of +/// indices: (CommutableOpIdx1, CommutableOpIdx2). It is assumed here that +/// the pairs (x,y) and (y,x) are equivalent. +/// +bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1, + unsigned &ResultIdx2, + unsigned CommutableOpIdx1, + unsigned CommutableOpIdx2) const { + if (ResultIdx1 == ~0U && ResultIdx2 == ~0U) { + ResultIdx1 = CommutableOpIdx1; + ResultIdx2 = CommutableOpIdx2; + } + else if (ResultIdx1 == ~0U) { + if (ResultIdx2 == CommutableOpIdx1) + ResultIdx1 = CommutableOpIdx2; + else if (ResultIdx2 == CommutableOpIdx2) + ResultIdx1 = CommutableOpIdx1; + else + return false; + } + else if (ResultIdx2 == ~0U) { + if (ResultIdx1 == CommutableOpIdx1) + ResultIdx2 = CommutableOpIdx2; + else if (ResultIdx1 == CommutableOpIdx2) + ResultIdx2 = CommutableOpIdx1; + else + return false; + } + else + // Check that the result operand indices match the given commutable + // operand indices. + return (ResultIdx1 == CommutableOpIdx1 && ResultIdx2 == CommutableOpIdx2) || + (ResultIdx1 == CommutableOpIdx2 && ResultIdx2 == CommutableOpIdx1); + + return true; +} + +/// Returns true iff the routine could find two commutable operands in the +/// given machine instruction. +/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their +/// input values can be re-defined in this method only if the input values +/// are not pre-defined, which is designated by the special value ~0U +/// assigned to it. +/// If both of indices are pre-defined and refer to some operands, then the +/// method simply returns true if the corresponding operands are commutable +/// and returns false otherwise. 
+/// +/// For example, calling this method this way: +/// unsigned Op1 = 1, Op2 = ~0U; +/// findCommutedOpIndices(MI, Op1, Op2); +/// can be interpreted as a query asking to find an operand that would be +/// commutable with the operand#1. +/// bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { @@ -196,10 +284,15 @@ const MCInstrDesc &MCID = MI->getDesc(); if (!MCID.isCommutable()) return false; + // This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this // is not true, then the target must implement this. - SrcOpIdx1 = MCID.getNumDefs(); - SrcOpIdx2 = SrcOpIdx1 + 1; + unsigned CommutableOpIdx1 = MCID.getNumDefs(); + unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1; + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + if (!MI->getOperand(SrcOpIdx1).isReg() || !MI->getOperand(SrcOpIdx2).isReg()) // No idea. @@ -207,6 +300,18 @@ return true; } +/// Returns true if the specified MI is commutable and the operands with +/// indices SrcOpIdx1 and SrcOpIdx2 can swap their values. +/// Otherwise, returns false. 
+bool TargetInstrInfo::areOpsCommutable(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + unsigned OpsNum = MI->getNumOperands(); + assert(SrcOpIdx1 < OpsNum && SrcOpIdx2 < OpsNum && + "TargetInstrInfo::areOpsCommutable() illegal operand index."); + + return findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); +} bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { Index: llvm/lib/CodeGen/TwoAddressInstructionPass.cpp =================================================================== --- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -110,8 +110,8 @@ bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, MachineInstr *MI, unsigned Dist); - bool commuteInstruction(MachineBasicBlock::iterator &mi, - unsigned RegB, unsigned RegC, unsigned Dist); + bool commuteInstruction(MachineInstr *MI, + unsigned RegBIdx, unsigned RegCIdx, unsigned Dist); bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB); @@ -133,6 +133,11 @@ unsigned SrcIdx, unsigned DstIdx, unsigned Dist, bool shouldOnlyCommute); + bool tryInstructionCommute(MachineInstr *MI, + unsigned DstOpIdx, + unsigned BaseOpIdx, + bool BaseOpKilled, + unsigned Dist); void scanUses(unsigned DstReg); void processCopy(MachineInstr *MI); @@ -646,11 +651,11 @@ /// block, distance map, and live variables if needed. Return true if it is /// successful. 
bool TwoAddressInstructionPass:: -commuteInstruction(MachineBasicBlock::iterator &mi, - unsigned RegB, unsigned RegC, unsigned Dist) { - MachineInstr *MI = mi; +commuteInstruction(MachineInstr *MI, + unsigned RegBIdx, unsigned RegCIdx, unsigned Dist) { + unsigned RegC = MI->getOperand(RegCIdx).getReg(); DEBUG(dbgs() << "2addr: COMMUTING : " << *MI); - MachineInstr *NewMI = TII->commuteInstruction(MI); + MachineInstr *NewMI = TII->commuteInstruction(MI, false, RegBIdx, RegCIdx); if (NewMI == nullptr) { DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n"); @@ -1155,6 +1160,51 @@ return true; } +/// Tries to commute the operand BaseOpIdx and some other operand in the given +/// machine instruction to improve opportunities for coalescing and elimination +/// of a register to register copy. +/// Returns true if the transformation happened. Otherwise, returns false. +/// +bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI, + unsigned DstOpIdx, + unsigned BaseOpIdx, + bool BaseOpKilled, + unsigned Dist) { + unsigned OtherOpIdx = MI->getDesc().getNumDefs(); + for (; OtherOpIdx < MI->getDesc().getNumOperands(); OtherOpIdx++) { + if (OtherOpIdx != BaseOpIdx && + TII->areOpsCommutable(MI, BaseOpIdx, OtherOpIdx)) { + + unsigned DstOpReg = MI->getOperand(DstOpIdx).getReg(); + unsigned BaseOpReg = MI->getOperand(BaseOpIdx).getReg(); + unsigned OtherOpReg = MI->getOperand(OtherOpIdx).getReg(); + bool AggressiveCommute = false; + + // If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp + // operands. This makes the live ranges of DstOp and OtherOp joinable. + bool DoCommute = + !BaseOpKilled && isKilled(*MI, OtherOpReg, MRI, TII, LIS, false); + + if (!DoCommute && + isProfitableToCommute(DstOpReg, BaseOpReg, OtherOpReg, MI, Dist)) { + DoCommute = true; + AggressiveCommute = true; + } + + // + // If it's profitable to commute, try to do so. 
+ // + if (DoCommute && commuteInstruction(MI, BaseOpIdx, OtherOpIdx, Dist)) { + ++NumCommuted; + if (AggressiveCommute) + ++NumAggrCommuted; + return true; + } + } + } + return false; +} + /// tryInstructionTransform - For the case where an instruction has a single /// pair of tied register operands, attempt some transformations that may /// either eliminate the tied operands or improve the opportunities for @@ -1181,31 +1231,7 @@ if (TargetRegisterInfo::isVirtualRegister(regA)) scanUses(regA); - // Check if it is profitable to commute the operands. - unsigned SrcOp1, SrcOp2; - unsigned regC = 0; - unsigned regCIdx = ~0U; - bool TryCommute = false; - bool AggressiveCommute = false; - if (MI.isCommutable() && MI.getNumOperands() >= 3 && - TII->findCommutedOpIndices(&MI, SrcOp1, SrcOp2)) { - if (SrcIdx == SrcOp1) - regCIdx = SrcOp2; - else if (SrcIdx == SrcOp2) - regCIdx = SrcOp1; - - if (regCIdx != ~0U) { - regC = MI.getOperand(regCIdx).getReg(); - if (!regBKilled && isKilled(MI, regC, MRI, TII, LIS, false)) - // If C dies but B does not, swap the B and C operands. - // This makes the live ranges of A and C joinable. - TryCommute = true; - else if (isProfitableToCommute(regA, regB, regC, &MI, Dist)) { - TryCommute = true; - AggressiveCommute = true; - } - } - } + bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist); // If the instruction is convertible to 3 Addr, instead // of returning try 3 Addr transformation aggresively and @@ -1215,17 +1241,8 @@ // addl %esi, %edi // movl %edi, %eax // ret - bool Commuted = false; - - // If it's profitable to commute, try to do so. 
- if (TryCommute && commuteInstruction(mi, regB, regC, Dist)) { - Commuted = true; - ++NumCommuted; - if (AggressiveCommute) - ++NumAggrCommuted; - if (!MI.isConvertibleTo3Addr()) - return false; - } + if (Commuted && !MI.isConvertibleTo3Addr()) + return false; if (shouldOnlyCommute) return false; Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -164,8 +164,8 @@ // Operand is not legal, so try to commute the instruction to // see if this makes it possible to fold. - unsigned CommuteIdx0; - unsigned CommuteIdx1; + unsigned CommuteIdx0 = ~0U; + unsigned CommuteIdx1 = ~0U; bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); if (CanCommute) { @@ -175,7 +175,13 @@ OpNo = CommuteIdx0; } - if (!CanCommute || !TII->commuteInstruction(MI)) + // FIXME: OpNo can be commuted with non-reg operand OtherOpNo, but + // such test cases are not handled well yet. 
+ if (CanCommute && + (!MI->getOperand(CommuteIdx0).isReg() || !MI->getOperand(CommuteIdx1).isReg())) + return false; + + if (!CanCommute || !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1)) return false; if (!TII->isOperandLegal(MI, OpNo, OpToFold)) Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -120,7 +120,10 @@ int commuteOpcode(const MachineInstr &MI) const; MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI = false) const override; + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const override; + bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -764,8 +764,18 @@ return true; } +/// Commutes the operands in the given instruction. +/// The commutable operands are specified by their indices OpIdx1 and OpIdx2. +/// +/// Do not call this method for a non-commutable instruction or for +/// non-commutable pair of operand indices OpIdx1 and OpIdx2. +/// Even though the instruction is commutable, the method may still +/// fail to commute the operands, null pointer is returned in such cases. 
+/// MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const { if (MI->getNumOperands() < 3) return nullptr; @@ -784,7 +794,12 @@ int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1); - if (Src1Idx == -1) + assert(Src1Idx != -1 && "Should always have src1 operand"); + + if (!(OpIdx0 == static_cast<unsigned>(Src0Idx) && + OpIdx1 == static_cast<unsigned>(Src1Idx)) && + !(OpIdx0 == static_cast<unsigned>(Src1Idx) && + OpIdx1 == static_cast<unsigned>(Src0Idx))) return nullptr; MachineOperand &Src1 = MI->getOperand(Src1Idx); @@ -832,7 +847,7 @@ Src1.ChangeToRegister(Reg, false); Src1.setSubReg(SubReg); } else { - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx0, OpIdx1); } if (MI) @@ -845,8 +860,8 @@ // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { + unsigned &SrcOpIdx0, + unsigned &SrcOpIdx1) const { const MCInstrDesc &MCID = MI->getDesc(); if (!MCID.isCommutable()) return false; @@ -857,7 +872,8 @@ return false; // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. + // immediate. Also, immediate src0 operand is not handled in + // SIInstrInfo::commuteInstruction(); if (!MI->getOperand(Src0Idx).isReg()) return false; @@ -865,18 +881,24 @@ if (Src1Idx == -1) return false; - if (!MI->getOperand(Src1Idx).isReg()) - return false; - - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. 
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + MachineOperand &Src1 = MI->getOperand(Src1Idx); + if (Src1.isImm()) { + // SIInstrInfo::commuteInstruction() does support commuting the immediate + // operand src1 in 2 and 3 operand instructions. + if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) + return false; + } + else if (Src1.isReg()) { + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + } + else return false; - SrcOpIdx1 = Src0Idx; - SrcOpIdx2 = Src1Idx; - return true; + return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, @@ -1723,7 +1745,7 @@ // than src1, so try to commute the instruction to decrease our // chances of having to insert a MOV instruction to legalize src1. if (MI->isCommutable()) { - if (commuteInstruction(MI)) + if (TargetInstrInfo::commuteInstruction(MI)) // If we are successful in commuting, then we know MI is legal, so // we are done. return; Index: llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -182,7 +182,8 @@ } // We have failed to fold src0, so commute the instruction and try again. - if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) + if (TryToCommute && MI.isCommutable() && + TII->TargetInstrInfo::commuteInstruction(&MI)) foldImmediates(MI, TII, MRI, false); } @@ -221,7 +222,8 @@ if (!canShrink(MI, TII, TRI, MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. 
- if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || + if (!MI.isCommutable() || + !TII->TargetInstrInfo::commuteInstruction(&MI) || !canShrink(MI, TII, TRI, MRI)) continue; } Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -188,8 +188,10 @@ MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const override; - MachineInstr *commuteInstruction(MachineInstr*, - bool=false) const override; + MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, unsigned SubIdx, unsigned State, Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1744,9 +1744,17 @@ llvm_unreachable("Unknown unconditional branch opcode!"); } -/// commuteInstruction - Handle commutable instructions. +/// Commutes the operands in the given instruction. +/// The commutable operands are specified by their indices OpIdx1 and OpIdx2. +/// +/// Do not call this method for a non-commutable instruction or for +/// non-commutable pair of operand indices OpIdx1 and OpIdx2. +/// Even though the instruction is commutable, the method may still +/// fail to commute the operands, null pointer is returned in such cases. +/// MachineInstr * -ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI, + unsigned OpIdx1, unsigned OpIdx2) const { switch (MI->getOpcode()) { case ARM::MOVCCr: case ARM::t2MOVCCr: { @@ -1756,7 +1764,7 @@ // MOVCC AL can't be inverted. Shouldn't happen. 
if (CC == ARMCC::AL || PredReg != ARM::CPSR) return nullptr; - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); if (!MI) return nullptr; // After swapping the MOVCC operands, also invert the condition. @@ -1765,7 +1773,7 @@ return MI; } } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); } /// Identify instructions that can be folded into a MOVCC instruction, and Index: llvm/lib/Target/ARM/Thumb2SizeReduction.cpp =================================================================== --- llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -654,17 +654,18 @@ if (Reg1 != Reg0) return false; // Try to commute the operands to make it a 2-address instruction. - MachineInstr *CommutedMI = TII->commuteInstruction(MI); + MachineInstr *CommutedMI = TII->TargetInstrInfo::commuteInstruction(MI); if (!CommutedMI) return false; } } else if (Reg0 != Reg1) { // Try to commute the operands to make it a 2-address instruction. - unsigned CommOpIdx1, CommOpIdx2; + unsigned CommOpIdx1 = 1, CommOpIdx2 = ~0U; if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) || - CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0) + MI->getOperand(CommOpIdx2).getReg() != Reg0) return false; - MachineInstr *CommutedMI = TII->commuteInstruction(MI); + MachineInstr *CommutedMI = TII->commuteInstruction(MI, false, + CommOpIdx1, CommOpIdx2); if (!CommutedMI) return false; } Index: llvm/lib/Target/PowerPC/PPCInstrInfo.h =================================================================== --- llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -159,9 +159,21 @@ unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - // commuteInstruction - We can commute rlwimi instructions, but only if the - // rotate amt is zero. 
We also have to munge the immediates a bit. - MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; + /// Commutes the operands in the given instruction. + /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. + /// + /// Do not call this method for a non-commutable instruction or for + /// non-commutable pair of operand indices OpIdx1 and OpIdx2. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. + /// + /// For example, we can commute rlwimi instructions, but only if the + /// rotate amt is zero. We also have to munge the immediates a bit. + /// + MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; Index: llvm/lib/Target/PowerPC/PPCInstrInfo.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -521,16 +521,26 @@ return 0; } -// commuteInstruction - We can commute rlwimi instructions, but only if the -// rotate amt is zero. We also have to munge the immediates a bit. +/// Commutes the operands in the given instruction. +/// The commutable operands are specified by their indices OpIdx1 and OpIdx2. +/// +/// Do not call this method for a non-commutable instruction or for +/// non-commutable pair of operand indices OpIdx1 and OpIdx2. +/// Even though the instruction is commutable, the method may still +/// fail to commute the operands, null pointer is returned in such cases. +/// +/// For example, we can commute rlwimi instructions, but only if the +/// rotate amt is zero. We also have to munge the immediates a bit. 
+/// MachineInstr * -PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI, + unsigned OpIdx1, unsigned OpIdx2) const { MachineFunction &MF = *MI->getParent()->getParent(); // Normal instructions can be commuted the obvious way. if (MI->getOpcode() != PPC::RLWIMI && MI->getOpcode() != PPC::RLWIMIo) - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because // changing the relative order of the mask operands might change what happens @@ -548,6 +558,8 @@ // Op0 = (Op2 & ~M) | (Op1 & M) // Swap op1/op2 + assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) && + "Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMIo."); unsigned Reg0 = MI->getOperand(0).getReg(); unsigned Reg1 = MI->getOperand(1).getReg(); unsigned Reg2 = MI->getOperand(2).getReg(); @@ -610,9 +622,9 @@ if (AltOpc == -1) return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; + // The commutable operand indices are 2 and 3. Return them in SrcOpIdx1 + // and SrcOpIdx2. + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); } void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, Index: llvm/lib/Target/X86/X86InstrFMA.td =================================================================== --- llvm/lib/Target/X86/X86InstrFMA.td +++ llvm/lib/Target/X86/X86InstrFMA.td @@ -60,27 +60,47 @@ string OpcodeStr, string PackTy, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - // For 213, both the register and memory variant are commutable. - // Indeed, the commutable operands are 1 and 2 and both live in registers - // for both variants. 
+let hasSideEffects = 0 in { + // For 213, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); + // operands 1 and 3 (register forms only): *213* --> *231*; + // operands 2 and 3 (register forms only): *213* --> *132*. defm r213 : fma3p_rm; -let hasSideEffects = 0 in { + // For 132, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *132* --> *231*; + // operands 1 and 3 (register forms only): *132* --> *132*(no changes); + // operands 2 and 3 (register forms only): *132* --> *213*. defm r132 : fma3p_rm; - // For 231, only the register variant is commutable. + MemFrag128, MemFrag256, OpTy128, OpTy256, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 1>; + // For 231, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. // For the memory variant the folded operand must be in 3. Thus, - // in that case, it cannot be swapped with 2. + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *231* --> *132*; + // operands 1 and 3 (register forms only): *231* --> *213*; + // operands 2 and 3 (register forms only): *231* --> *231*(no changes). 
defm r231 : fma3p_rm; + /* IsMVariantCommutable */ 1>; } // hasSideEffects = 0 } @@ -156,23 +176,54 @@ X86MemOperand x86memop, Operand memop, PatFrag mem_frag, ComplexPattern mem_cpat> { let hasSideEffects = 0 in { + // For 132, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *132* --> *231*; + // operands 1 and 3 (register forms only): *132* --> *132*(no changes); + // operands 2 and 3 (register forms only): *132* --> *213*. + // Commuting the operand 1 with some other operand changes the upper bits + // of the result FMA instruction. Thus, it requires a proof of the fact that + // only the lowest element of the result is used. defm r132 : fma3s_rm; - // See the other defm of r231 for the explanation regarding the - // commutable flags. + x86memop, RC, OpVT, mem_frag, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 1>; + // For 231, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *231* --> *132*; + // operands 1 and 3 (register forms only): *231* --> *213*; + // operands 2 and 3 (register forms only): *231* --> *231*(no changes). + // Commuting the operand 1 with some other operand changes the upper bits + // of the result FMA instruction. Thus, it requires a proof of the fact that + // only the lowest element of the result is used. 
defm r231 : fma3s_rm; -} + /* IsMVariantCommutable */ 1>; -// See the other defm of r213 for the explanation regarding the -// commutable flags. -defm r213 : fma3s_rm; + // For 213, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); + // operands 1 and 3 (register forms only): *213* --> *231*; + // operands 2 and 3 (register forms only): *213* --> *132*. + // Commuting the operand 1 with some other operand changes the upper bits + // of the result FMA instruction. Thus, it requires a proof of the fact that + // only the lowest element of the result is used. + defm r213 : fma3s_rm; +} } multiclass fma3s opc132, bits<8> opc213, bits<8> opc231, Index: llvm/lib/Target/X86/X86InstrInfo.h =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.h +++ llvm/lib/Target/X86/X86InstrInfo.h @@ -259,14 +259,85 @@ MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const override; - /// commuteInstruction - We have a few instructions that must be hacked on to - /// commute them. + /// Commutes the operands in the given instruction by changing the operands + /// order and/or changing the instruction's opcode and/or the immediate value + /// operand. + /// + /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands + /// to be commuted. + /// + /// Do not call this method for a non-commutable instruction. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. 
+ /// + MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI, + unsigned CommuteOpIdx1, + unsigned CommuteOpIdx2) const override; + + /// Returns true iff the routine could find two commutable operands in the + /// given machine instruction. + /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their + /// input values can be re-defined in this method only if the input values + /// are not pre-defined, which is designated by the special value ~0U + /// assigned to it. + /// If both of indices are pre-defined and refer to some operands, then the + /// method simply returns true if the corresponding operands are commutable + /// and returns false otherwise. + /// + /// For example, calling this method this way: + /// unsigned Op1 = 1, Op2 = ~0U; + /// findCommutedOpIndices(MI, Op1, Op2); + /// can be interpreted as a query asking to find an operand that would be + /// commutable with the operand#1. /// - MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; - bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; + /// Returns true if the routine could find two commutable operands + /// in the given FMA instruction. Otherwise, returns false. + /// + /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. + /// The output indices of the commuted operands are returned in these + /// arguments. Also, the input values of these arguments may be preset either + /// to indices of operands that must be commuted or be equal to a special + /// value (~0U) which means that the corresponding operand index is not set + /// and this method is free to pick any of available commutable operands. + /// + /// For example, calling this method this way: + /// findFMA3CommutedOpIndices(MI, 1, ~0U); + /// can be interpreted as a query asking if the operand #1 can be swapped + /// with any other available operand (e.g. operand #2, operand #3, etc.). 
+ /// + /// The returned FMA opcode may differ from the opcode in the given MI. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results in an instruction with an adjusted opcode: + /// FMA231 #3, #2, #1 + /// + bool findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const; + + /// Returns an adjusted FMA opcode that must be used in FMA instruction that + /// performs the same computations as the given MI but which has the operands + /// SrcOpIdx1 and SrcOpIdx2 commuted. + /// It may return 0 if it is unsafe to commute the operands. + /// + /// The returned FMA opcode may differ from the opcode in the given MI. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results in an instruction with an adjusted opcode: + /// FMA231 #3, #2, #1 + /// + unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const; + + /// Returns true if the given instruction opcode is FMA3. + /// Otherwise, returns false. + /// + bool isFMA3(unsigned Opcode) const; + // Branch analysis. bool isUnpredicatedTerminator(const MachineInstr* MI) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, Index: llvm/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.cpp +++ llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2923,10 +2923,21 @@ return NewMI; } -/// We have a few instructions that must be hacked on to commute them. +/// Commutes the operands in the given instruction by changing the operands +/// order and/or changing the instruction's opcode and/or the immediate value +/// operand. +/// The arguments 'OpIdx1' and 'OpIdx2' specify the operands to be commuted. +/// +/// Do not call this method for a non-commutable instruction. 
+/// Even though the instruction is commutable, the method may still +/// fail to commute the operands, null pointer is returned in such cases. /// MachineInstr * -X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { + + switch (MI->getOpcode()) { case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) @@ -2953,7 +2964,7 @@ } MI->setDesc(get(Opc)); MI->getOperand(3).setImm(Size-Amt); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: @@ -2989,7 +3000,7 @@ NewMI = false; } MI->getOperand(3).setImm(Mask ^ Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr:{ @@ -3004,7 +3015,7 @@ NewMI = false; } MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMPPDrri: case X86::CMPPSrri: @@ -3025,7 +3036,7 @@ MI = MF.CloneMachineInstr(MI); NewMI = false; } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); default: return nullptr; } @@ -3054,7 +3065,7 @@ NewMI = false; } MI->getOperand(3).setImm(Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: @@ -3133,11 +3144,407 @@ // Fallthrough intended. 
} default: - return TargetInstrInfo::commuteInstruction(MI, NewMI); + if (isFMA3(MI->getOpcode())) { + unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + if (Opc == 0) { + return nullptr; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); + } + return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2); + } +} + +/// +/// Returns true if the given instruction opcode is FMA3. +/// Otherwise, returns false. +/// +bool X86InstrInfo::isFMA3(unsigned Opcode) const { + switch (Opcode) { + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSUBPDr132r: 
case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case 
X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + return 
true; + default: + break; + } + return false; +} + +/// +/// Returns true if the routine could find two commutable operands +/// in the given FMA instruction. Otherwise, returns false. +/// +/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. +/// The output indices of the commuted operands are returned in these +/// arguments. Also, the input values of these arguments may be preset either +/// to indices of operands that must be commuted or be equal to a special +/// value (~0U) which means that the corresponding operand index is not set +/// and this method is free to pick any of the available commutable operands. +/// +/// For example, calling this method this way: +/// findFMA3CommutedOpIndices(MI, 1, ~0U); +/// can be interpreted as a query asking if the operand #1 can be swapped +/// with any other available operand (e.g. operand #2, operand #3, etc.). +/// +/// The returned FMA opcode may differ from the opcode in the given MI. +/// For example, commuting the operands #1 and #3 in the following FMA +/// FMA213 #1, #2, #3 +/// results in an instruction with an adjusted opcode: +/// FMA231 #3, #2, #1 +/// +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + + unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + + // + // Only the first RegOpsNum operands are commutable. + // Also, the value ~0U is valid here as it means that the operand is not + // specified/fixed. + // + if (SrcOpIdx1 < 1 || (SrcOpIdx1 > RegOpsNum && SrcOpIdx1 != ~0U) || + SrcOpIdx2 < 1 || (SrcOpIdx2 > RegOpsNum && SrcOpIdx2 != ~0U)) { + return false; } + + if (SrcOpIdx1 == ~0U || SrcOpIdx2 == ~0U) { + unsigned CommutableOpIdx1 = SrcOpIdx1; + unsigned CommutableOpIdx2 = SrcOpIdx2; + + // + // At least one of operands to be commuted is not specified and + // this method is free to choose appropriate commutable operands. + // + if (SrcOpIdx1 == SrcOpIdx2) { + // Both of operands are not fixed. 
By default set one of commutable + // operands to the last operand of the instruction. + // + CommutableOpIdx2 = RegOpsNum; + } + else if (SrcOpIdx2 == ~0U) { + // Only one of operands is not fixed. + // + CommutableOpIdx2 = SrcOpIdx1; + } + + // CommutableOpIdx2 is well defined now. Let's choose another commutable + // operand and assign its index to CommutableOpIdx1. + // + unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + // The commuted operands must have different registers. + // Otherwise, the commute transformation does not change anything and + // is useless then. + // + if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + break; + } + + // No appropriate commutable operands were found. + // + if (CommutableOpIdx1 == 0) + return false; + + // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2 + // to return those values. + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + } + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, +/// +/// Returns an adjusted FMA opcode that must be used in FMA instruction that +/// performs the same computations as the given MI but which has the operands +/// SrcOpIdx1 and SrcOpIdx2 commuted. +/// It may return 0 if it is unsafe to commute the operands. +/// +/// The returned FMA opcode may differ from the opcode in the given MI. +/// For example, commuting the operands #1 and #3 in the following FMA +/// FMA213 #1, #2, #3 +/// results in an instruction with an adjusted opcode: +/// FMA231 #3, #2, #1 +/// +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + int RetOpc = 0; + int Opc = MI->getOpcode(); + + // + // Struct which describes FMA opcodes and dependencies between them. 
+ // + static const struct { + int Opc1; + int Opc2; + int Opc3; + bool IsScalar; + } OpcodeAlts[] = { + { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r, true }, + { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r, true }, + { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r, false }, + { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r, false }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY,false }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY,false }, + { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m, true }, + { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m, true }, + { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m, false }, + { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m, false }, + { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY,false }, + { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY,false }, + + { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r, true }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r, true }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r, false }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r, false }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY,false }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY,false }, + { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m, true }, + { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m, true }, + { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m, false }, + { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m, false }, + { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY,false }, + { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY,false }, + + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r, true }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, 
X86::VFNMADDSDr231r, true }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r, false }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r, false }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY,false }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY,false }, + { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m, true }, + { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m, true }, + { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m, false }, + { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m, false }, + { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY,false }, + { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY,false }, + + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r, true }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r, true }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r, false }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r, false }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY,false }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY,false }, + { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m, true }, + { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m, true }, + { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m, false }, + { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m, false }, + { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY,false }, + { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY,false }, + + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r, false }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r, false }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY,false }, + { X86::VFMADDSUBPDr132rY, 
X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY,false }, + { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m, false }, + { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m, false }, + { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY,false }, + { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY,false }, + + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r, false }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r, false }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY,false }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY,false }, + { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m, false }, + { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m, false }, + { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY,false }, + { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY,false } + }; + + unsigned char OpcodeAltsNum = sizeof(OpcodeAlts) / sizeof(OpcodeAlts[0]); + int i, pos = 0; + for (i = 0; i < OpcodeAltsNum; i++) { + if (OpcodeAlts[i].Opc2 == Opc) { + pos = 2; + break; + } + if (OpcodeAlts[i].Opc1 == Opc) { + pos = 1; + break; + } + if (OpcodeAlts[i].Opc3 == Opc) { + pos = 3; + break; + } + } + + // + // Input opcode does not match with any from the table. + // + if (pos == 0) + return 0; + + // FIXME: Commuting the 1st operand of scalar FMA requires some additional + // analysis such as getting proof of the fact that all uses of the + // given FMA instruction use only the lowest element. Without proving + // that, commuting the 1st operand of scalar FMAs changes the upper bits + // of the result. + // + if (OpcodeAlts[i].IsScalar && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) + return 0; + + // + // Find reversed FMA opcode. 
+ // + if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 2) || + (SrcOpIdx1 == 2 && SrcOpIdx2 == 1)) { + if (pos == 1) + RetOpc = OpcodeAlts[i].Opc3; + else if (pos == 2) + RetOpc = Opc; + else + RetOpc = OpcodeAlts[i].Opc1; + } + else if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 3) || + (SrcOpIdx1 == 3 && SrcOpIdx2 == 1)) { + if (pos == 1) + RetOpc = Opc; + else if (pos == 2) + RetOpc = OpcodeAlts[i].Opc3; + else + RetOpc = OpcodeAlts[i].Opc2; + } + else if ((SrcOpIdx1 == 2 && SrcOpIdx2 == 3) || + (SrcOpIdx1 == 3 && SrcOpIdx2 == 2)) { + if (pos == 1) + RetOpc = OpcodeAlts[i].Opc2; + else if (pos == 2) + RetOpc = OpcodeAlts[i].Opc1; + else + RetOpc = Opc; + } + + return RetOpc; +} + +/// Returns true iff the routine could find two commutable operands in the +/// given machine instruction. +/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their +/// input values can be re-defined in this method only if the input values +/// are not pre-defined, which is designated by the special value ~0U +/// assigned to it. +/// If both of indices are pre-defined and refer to some operands, then the +/// method simply returns true if the corresponding operands are commutable +/// and returns false otherwise. +/// +/// For example, calling this method this way: +/// unsigned Op1 = 1, Op2 = ~0U; +/// findCommutedOpIndices(MI, Op1, Op2); +/// can be interpreted as a query asking to find an operand that would be +/// commutable with the operand#1. 
+/// +bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { case X86::CMPPDrri: @@ -3150,46 +3557,24 @@ // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI->getOperand(3).getImm() & 0x7; switch (Imm) { - case 0x00: // EQUAL - case 0x03: // UNORDERED - case 0x04: // NOT EQUAL - case 0x07: // ORDERED - SrcOpIdx1 = 1; - SrcOpIdx2 = 2; - return true; + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + // The indices of the commutable operands are 1 and 2. + // Assign them to the returned operand indices here. + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); } return false; } - case X86::VFMADDPDr231r: - case X86::VFMADDPSr231r: - case X86::VFMADDSDr231r: - case X86::VFMADDSSr231r: - case X86::VFMSUBPDr231r: - case X86::VFMSUBPSr231r: - case X86::VFMSUBSDr231r: - case X86::VFMSUBSSr231r: - case X86::VFNMADDPDr231r: - case X86::VFNMADDPSr231r: - case X86::VFNMADDSDr231r: - case X86::VFNMADDSSr231r: - case X86::VFNMSUBPDr231r: - case X86::VFNMSUBPSr231r: - case X86::VFNMSUBSDr231r: - case X86::VFNMSUBSSr231r: - case X86::VFMADDPDr231rY: - case X86::VFMADDPSr231rY: - case X86::VFMSUBPDr231rY: - case X86::VFMSUBPSr231rY: - case X86::VFNMADDPDr231rY: - case X86::VFNMADDPSr231rY: - case X86::VFNMSUBPDr231rY: - case X86::VFNMSUBPSr231rY: - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; default: + if (isFMA3(MI->getOpcode())) { + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + } return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } + + return false; } static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { @@ -4972,60 +5357,58 @@ // If the instruction and target operand are commutable, commute the // instruction and try again. 
if (AllowCommute) { - unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2; + unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = ~0U; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI->getDesc().getNumDefs(); unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); - bool Tied0 = - 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied2 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); // If either of the commutable operands are tied to the destination // then we can not commute + fold. - if ((HasDef && Reg0 == Reg1 && Tied0) || - (HasDef && Reg0 == Reg2 && Tied1)) + if ((HasDef && Reg0 == Reg1 && Tied1) || + (HasDef && Reg0 == Reg2 && Tied2)) return nullptr; - if ((CommuteOpIdx1 == OriginalOpIdx) || - (CommuteOpIdx2 == OriginalOpIdx)) { - MachineInstr *CommutedMI = commuteInstruction(MI, false); - if (!CommutedMI) { - // Unable to commute. - return nullptr; - } - if (CommutedMI != MI) { - // New instruction. We can't fold from this. - CommutedMI->eraseFromParent(); - return nullptr; - } + MachineInstr *CommutedMI = commuteInstruction(MI, false, + CommuteOpIdx1, + CommuteOpIdx2); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } - // Attempt to fold with the commuted version of the instruction. - unsigned CommuteOp = - (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); - NewMI = - foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align, - /*AllowCommute=*/false); - if (NewMI) - return NewMI; - - // Folding failed again - undo the commute before returning. 
- MachineInstr *UncommutedMI = commuteInstruction(MI, false); - if (!UncommutedMI) { - // Unable to commute. - return nullptr; - } - if (UncommutedMI != MI) { - // New instruction. It doesn't need to be kept. - UncommutedMI->eraseFromParent(); - return nullptr; - } + // Attempt to fold with the commuted version of the instruction. + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, + Size, Align, /*AllowCommute=*/false); + if (NewMI) + return NewMI; - // Return here to prevent duplicate fuse failure report. + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = commuteInstruction(MI, false, + CommuteOpIdx1, + CommuteOpIdx2); + if (!UncommutedMI) { + // Unable to commute. + return nullptr; + } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); return nullptr; } + + // Return here to prevent duplicate fuse failure report. + return nullptr; } } Index: llvm/test/CodeGen/X86/fma-commute-x86.ll =================================================================== --- llvm/test/CodeGen/X86/fma-commute-x86.ll +++ llvm/test/CodeGen/X86/fma-commute-x86.ll @@ -0,0 +1,312 @@ +; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s +; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s + +declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fmadd132ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fmadd231ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 
x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fmadd213ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone +define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fmadd132ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fmadd231ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fmadd213ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fmadd132pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fmadd231pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fmadd213pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x 
double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone +define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fmadd132pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fmadd231pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fmadd213pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + + + +declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fnmadd132ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fnmadd231ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fnmadd213ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x 
float>, <8 x float>, <8 x float>) nounwind readnone +define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fnmadd132ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fnmadd231ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fnmadd213ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fnmadd132pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fnmadd231pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fnmadd213pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone +define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fnmadd132pd 
{{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fnmadd231pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fnmadd213pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + + +declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fmsub132ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fmsub231ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fmsub213ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone +define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fmsub132ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> 
@test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fmsub231ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fmsub213ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fmsub132pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fmsub231pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fmsub213pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone +define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fmsub132pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fmsub231pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> 
%a, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fmsub213pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + + +declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fnmsub132ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fnmsub231ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) { + ; CHECK: fnmsub213ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone +define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fnmsub132ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fnmsub231ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) { + ; CHECK: fnmsub213ps {{.*%r.*}}, 
%ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fnmsub132pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fnmsub231pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) { + ; CHECK: fnmsub213pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone +define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fnmsub132pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fnmsub231pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) { + ; CHECK: fnmsub213pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> 
%res +} + Index: llvm/test/CodeGen/X86/fma_patterns.ll =================================================================== --- llvm/test/CodeGen/X86/fma_patterns.ll +++ llvm/test/CodeGen/X86/fma_patterns.ll @@ -134,7 +134,7 @@ } ; CHECK: test_x86_fnmadd_ss -; CHECK: vfnmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1 ; CHECK: ret ; CHECK_FMA4: test_x86_fnmadd_ss ; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 @@ -146,7 +146,7 @@ } ; CHECK: test_x86_fnmadd_sd -; CHECK: vfnmadd213sd %xmm2, %xmm1, %xmm0 +; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1 ; CHECK: ret ; CHECK_FMA4: test_x86_fnmadd_sd ; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 @@ -158,7 +158,7 @@ } ; CHECK: test_x86_fmsub_sd -; CHECK: vfmsub213sd %xmm2, %xmm1, %xmm0 +; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1 ; CHECK: ret ; CHECK_FMA4: test_x86_fmsub_sd ; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 @@ -170,7 +170,7 @@ } ; CHECK: test_x86_fnmsub_ss -; CHECK: vfnmsub213ss %xmm2, %xmm1, %xmm0 +; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1 ; CHECK: ret ; CHECK_FMA4: test_x86_fnmsub_ss ; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 @@ -183,8 +183,7 @@ } ; CHECK: test_x86_fmadd_ps_load -; CHECK: vmovaps (%rdi), %xmm2 -; CHECK: vfmadd213ps %xmm1, %xmm2, %xmm0 +; CHECK: vfmadd132ps (%rdi), %xmm1, %xmm0 ; CHECK: ret ; CHECK_FMA4: test_x86_fmadd_ps_load ; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0 @@ -197,8 +196,7 @@ } ; CHECK: test_x86_fmsub_ps_load -; CHECK: vmovaps (%rdi), %xmm2 -; CHECK: fmsub213ps %xmm1, %xmm2, %xmm0 +; CHECK: vfmsub132ps (%rdi), %xmm1, %xmm0 ; CHECK: ret ; CHECK_FMA4: test_x86_fmsub_ps_load ; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0