Index: llvm/include/llvm/Target/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/Target/TargetInstrInfo.h
+++ llvm/include/llvm/Target/TargetInstrInfo.h
@@ -94,6 +94,20 @@
     return false;
   }
 
+  /// Assigns the (CommutableOpIdx1, CommutableOpIdx2) pair of commutable
+  /// operand indices to (ResultIdx1, ResultIdx2).
+  /// One or both input values of the pair: (ResultIdx1, ResultIdx2) may be
+  /// predefined to some indices or be undefined (designated by ~0U value).
+  /// The predefined result indices cannot be re-defined.
+  /// The function returns true iff after the result pair redefinition
+  /// the fixed result pair is equal to or equivalent to the source pair of
+  /// indices: (CommutableOpIdx1, CommutableOpIdx2). It is assumed here that
+  /// the pairs (x,y) and (y,x) are equivalent.
+  virtual bool fixCommutedOpIndices(unsigned &ResultIdx1,
+                                    unsigned &ResultIdx2,
+                                    unsigned CommutableOpIdx1,
+                                    unsigned CommutableOpIdx2) const;
+
 private:
   /// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
   /// set and the target hook isReallyTriviallyReMaterializable returns false,
@@ -255,18 +269,48 @@
   /// commute them, this method can overloaded to do that.
   /// The default implementation simply swaps the commutable operands.
   /// If NewMI is false, MI is modified in place and returned; otherwise, a
-  /// new machine instruction is created and returned.  Do not call this
-  /// method for a non-commutable instruction, but there may be some cases
-  /// where this method fails and returns null.
+  /// new machine instruction is created and returned.
+  ///
+  /// The overloaded version of the method with the indices of the
+  /// commuted operands may be used when the commuted instruction has
+  /// more than two operands and thus, there may be preferences in what
+  /// operand must be commuted.
+  ///
+  /// Do not call these methods for a non-commutable instruction.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands, null pointer is returned in such cases.
   virtual MachineInstr *commuteInstruction(MachineInstr *MI,
                                            bool NewMI = false) const;
-
-  /// If specified MI is commutable, return the two operand indices that would
-  /// swap value. Return false if the instruction
-  /// is not in a form which this routine understands.
+  virtual MachineInstr *commuteInstruction(MachineInstr *MI,
+                                           bool NewMI,
+                                           unsigned Idx1,
+                                           unsigned Idx2) const;
+
+  /// Returns true iff the routine could find two commutable operands in the
+  /// given machine instruction.
+  /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+  /// input values can be re-defined in this method only if the input values
+  /// are not pre-defined, which is designated by the special value ~0U
+  /// assigned to it.
+  /// If both indices are pre-defined and refer to some operands, then the
+  /// method simply returns true if the corresponding operands are commutable
+  /// and returns false otherwise.
+  ///
+  /// For example, calling this method this way:
+  ///     unsigned Op1 = 1, Op2 = ~0U;
+  ///     findCommutedOpIndices(MI, Op1, Op2);
+  /// can be interpreted as a query asking to find an operand that would be
+  /// commutable with the operand#1.
+  ///
   virtual bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                                      unsigned &SrcOpIdx2) const;
 
+  /// Returns true if the specified MI is commutable and the operands with
+  /// indices SrcOpIdx1 and SrcOpIdx2 can swap their values.
+  /// Otherwise, returns false.
+  virtual bool areOpsCommutable(MachineInstr *MI, unsigned SrcOpIdx1,
+                                unsigned SrcOpIdx2) const;
+
   /// A pair composed of a register and a sub-register index.
   /// Used to give some type checking when modeling Reg:SubReg.
   struct RegSubRegPair {
Index: llvm/lib/CodeGen/RegisterCoalescer.cpp
===================================================================
--- llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -679,14 +679,19 @@
   unsigned UseOpIdx;
   if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))
     return false;
-  unsigned Op1, Op2, NewDstIdx;
-  if (!TII->findCommutedOpIndices(DefMI, Op1, Op2))
-    return false;
-  if (Op1 == UseOpIdx)
-    NewDstIdx = Op2;
-  else if (Op2 == UseOpIdx)
-    NewDstIdx = Op1;
-  else
+
+  //
+  // FIXME: The code below tries to commute 'UseOpIdx' operand with some other
+  // commutable operand which is expressed by ~0U value passed to the method.
+  // That _other_ operand is chosen by the findCommutedOpIndices() method.
+  //
+  // That is obviously an area for improvement in case of instructions having
+  // more than 2 operands. For example, if some instruction has 3 commutable
+  // operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3,
+  // op#2<->op#3) of commute transformation should be considered/tried here.
+  //
+  unsigned NewDstIdx = ~0U;
+  if (!TII->findCommutedOpIndices(DefMI, UseOpIdx, NewDstIdx))
     return false;
 
   MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
@@ -719,7 +724,8 @@
   // At this point we have decided that it is legal to do this
   // transformation.  Start by commuting the instruction.
   MachineBasicBlock *MBB = DefMI->getParent();
-  MachineInstr *NewMI = TII->commuteInstruction(DefMI);
+  MachineInstr *NewMI = TII->commuteInstruction(DefMI, false,
+                                                UseOpIdx, NewDstIdx);
   if (!NewMI)
     return false;
   if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
Index: llvm/lib/CodeGen/TargetInstrInfo.cpp
===================================================================
--- llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -118,23 +118,37 @@
   MBB->addSuccessor(NewDest);
 }
 
-// commuteInstruction - The default implementation of this method just exchanges
-// the two operands returned by findCommutedOpIndices.
+/// commuteInstruction - If a target has any instructions that are
+/// commutable but require converting to different instructions or making
+/// non-trivial changes to commute them, these methods can be overloaded to
+/// do that.  The default implementations simply swap the commutable
+/// operands.
+///
+/// If NewMI is false, MI is modified in place and returned; otherwise, a
+/// new machine instruction is created and returned.
+///
+/// The passed operand indices are used to tell what operands must
+/// be commuted.
+///
+/// Do not call this method for a non-commutable instruction.
+/// Even though the instruction is commutable, the method may still
+/// fail to commute the operands, null pointer is returned in such cases.
+///
 MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI,
-                                                  bool NewMI) const {
+                                                  bool NewMI,
+                                                  unsigned Idx1,
+                                                  unsigned Idx2) const {
   const MCInstrDesc &MCID = MI->getDesc();
   bool HasDef = MCID.getNumDefs();
   if (HasDef && !MI->getOperand(0).isReg())
     // No idea how to commute this instruction. Target should implement its own.
     return nullptr;
-  unsigned Idx1, Idx2;
-  if (!findCommutedOpIndices(MI, Idx1, Idx2)) {
-    assert(MI->isCommutable() && "Precondition violation: MI must be commutable.");
-    return nullptr;
-  }
 
+  assert(areOpsCommutable(MI, Idx1, Idx2) &&
+         "TargetInstrInfo::CommuteInstruction(): not commutable operands.");
   assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() &&
          "This only knows how to commute register operands so far");
+
   unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
   unsigned Reg1 = MI->getOperand(Idx1).getReg();
   unsigned Reg2 = MI->getOperand(Idx2).getReg();
@@ -184,9 +198,83 @@
   return MI;
 }
 
-/// findCommutedOpIndices - If specified MI is commutable, return the two
-/// operand indices that would swap value. Return true if the instruction
-/// is not in a form which this routine understands.
+/// Commutes MI without any preference for which operands get exchanged.
+/// Both operand indices are left undefined (~0U), so the pair to swap is
+/// picked by findCommutedOpIndices(); the actual exchange is then delegated
+/// to the overload taking explicit operand indices.
+/// Returns null if no commutable pair of operands could be found.
+///
+MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI,
+                                                  bool NewMI) const {
+  unsigned FirstIdx = ~0U;
+  unsigned SecondIdx = ~0U;
+  if (findCommutedOpIndices(MI, FirstIdx, SecondIdx))
+    return commuteInstruction(MI, NewMI, FirstIdx, SecondIdx);
+
+  assert(MI->isCommutable() &&
+         "Precondition violation: MI must be commutable.");
+  return nullptr;
+}
+
+/// Resolves the result pair (ResultIdx1, ResultIdx2) against the pair of
+/// commutable operand indices (CommutableOpIdx1, CommutableOpIdx2).
+/// Each result index is either predefined by the caller or undefined, the
+/// latter being designated by the ~0U value. A predefined index is never
+/// overwritten; an undefined one is filled in from the commutable pair.
+/// The pairs (x,y) and (y,x) are considered equivalent.
+/// Returns true iff the resulting pair is equal to or equivalent to
+/// (CommutableOpIdx1, CommutableOpIdx2); returns false when the predefined
+/// indices cannot be matched with the commutable pair.
+///
+bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1,
+                                           unsigned &ResultIdx2,
+                                           unsigned CommutableOpIdx1,
+                                           unsigned CommutableOpIdx2) const {
+  const unsigned Undefined = ~0U;
+  const bool Idx1Fixed = ResultIdx1 != Undefined;
+  const bool Idx2Fixed = ResultIdx2 != Undefined;
+  // Both indices are predefined: only verify that they match the pair.
+  if (Idx1Fixed && Idx2Fixed) {
+    bool SameOrder =
+        ResultIdx1 == CommutableOpIdx1 && ResultIdx2 == CommutableOpIdx2;
+    bool Swapped =
+        ResultIdx1 == CommutableOpIdx2 && ResultIdx2 == CommutableOpIdx1;
+    return SameOrder || Swapped;
+  }
+  // Neither index is predefined: take the commutable pair as is.
+  if (!Idx1Fixed && !Idx2Fixed) {
+    ResultIdx1 = CommutableOpIdx1;
+    ResultIdx2 = CommutableOpIdx2;
+    return true;
+  }
+  // Exactly one index is predefined: the other gets the matching partner.
+  unsigned Fixed = Idx1Fixed ? ResultIdx1 : ResultIdx2;
+  unsigned &Free = Idx1Fixed ? ResultIdx2 : ResultIdx1;
+  if (Fixed == CommutableOpIdx1)
+    Free = CommutableOpIdx2;
+  else if (Fixed == CommutableOpIdx2)
+    Free = CommutableOpIdx1;
+  else
+    return false;
+  return true;
+}
+
+/// Returns true iff the routine could find two commutable operands in the
+/// given machine instruction.
+/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+/// input values can be re-defined in this method only if the input values
+/// are not pre-defined, which is designated by the special value ~0U
+/// assigned to it.
+/// If both indices are pre-defined and refer to some operands, then the
+/// method simply returns true if the corresponding operands are commutable
+/// and returns false otherwise.
+///
+/// For example, calling this method this way:
+///     unsigned Op1 = 1, Op2 = ~0U;
+///     findCommutedOpIndices(MI, Op1, Op2);
+/// can be interpreted as a query asking to find an operand that would be
+/// commutable with the operand#1.
+///
 bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI,
                                             unsigned &SrcOpIdx1,
                                             unsigned &SrcOpIdx2) const {
@@ -196,10 +284,15 @@
   const MCInstrDesc &MCID = MI->getDesc();
   if (!MCID.isCommutable())
     return false;
+
   // This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this
   // is not true, then the target must implement this.
-  SrcOpIdx1 = MCID.getNumDefs();
-  SrcOpIdx2 = SrcOpIdx1 + 1;
+  unsigned CommutableOpIdx1 = MCID.getNumDefs();
+  unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1;
+  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+                            CommutableOpIdx1, CommutableOpIdx2))
+    return false;
+
   if (!MI->getOperand(SrcOpIdx1).isReg() ||
       !MI->getOperand(SrcOpIdx2).isReg())
     // No idea.
@@ -207,6 +300,18 @@
   return true;
 }
 
+/// Returns true if the specified MI is commutable and the operands with
+/// indices SrcOpIdx1 and SrcOpIdx2, which must both refer to existing
+/// operands of MI, can swap their values. Otherwise, returns false.
+bool TargetInstrInfo::areOpsCommutable(MachineInstr *MI,
+                                       unsigned SrcOpIdx1,
+                                       unsigned SrcOpIdx2) const {
+  // Bounds are read inside the assert to avoid an unused local in NDEBUG.
+  assert(SrcOpIdx1 < MI->getNumOperands() &&
+         SrcOpIdx2 < MI->getNumOperands() &&
+         "TargetInstrInfo::areOpsCommutable() illegal operand index.");
+  return findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+}
 
 bool
 TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
Index: llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -110,8 +110,8 @@
   bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
                              MachineInstr *MI, unsigned Dist);
 
-  bool commuteInstruction(MachineBasicBlock::iterator &mi,
-                          unsigned RegB, unsigned RegC, unsigned Dist);
+  bool commuteInstruction(MachineInstr *MI,
+                          unsigned RegBIdx, unsigned RegCIdx, unsigned Dist);
 
   bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB);
 
@@ -133,6 +133,11 @@
                                unsigned SrcIdx, unsigned DstIdx,
                                unsigned Dist, bool shouldOnlyCommute);
 
+  bool tryInstructionCommute(MachineInstr *MI,
+                             unsigned DstOpIdx,
+                             unsigned BaseOpIdx,
+                             bool BaseOpKilled,
+                             unsigned Dist);
   void scanUses(unsigned DstReg);
 
   void processCopy(MachineInstr *MI);
@@ -646,11 +651,11 @@
 /// block, distance map, and live variables if needed. Return true if it is
 /// successful.
 bool TwoAddressInstructionPass::
-commuteInstruction(MachineBasicBlock::iterator &mi,
-                   unsigned RegB, unsigned RegC, unsigned Dist) {
-  MachineInstr *MI = mi;
+commuteInstruction(MachineInstr *MI,
+                   unsigned RegBIdx, unsigned RegCIdx, unsigned Dist) {
+  unsigned RegC = MI->getOperand(RegCIdx).getReg();
   DEBUG(dbgs() << "2addr: COMMUTING  : " << *MI);
-  MachineInstr *NewMI = TII->commuteInstruction(MI);
+  MachineInstr *NewMI = TII->commuteInstruction(MI, false, RegBIdx, RegCIdx);
 
   if (NewMI == nullptr) {
     DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
@@ -1155,6 +1160,51 @@
   return true;
 }
 
+/// Tries to commute the operand BaseOpIdx with some other register operand
+/// of the given machine instruction to improve opportunities for coalescing
+/// and elimination of a register to register copy.
+/// Returns true if the transformation happened. Otherwise, returns false.
+///
+bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
+                                                      unsigned DstOpIdx,
+                                                      unsigned BaseOpIdx,
+                                                      bool BaseOpKilled,
+                                                      unsigned Dist) {
+  unsigned OtherOpIdx = MI->getDesc().getNumDefs();
+  for (; OtherOpIdx < MI->getDesc().getNumOperands(); OtherOpIdx++) {
+    // A target may report a non-register (e.g. immediate) operand as
+    // commutable, so check isReg() before getReg() is called below.
+    if (OtherOpIdx == BaseOpIdx ||
+        !TII->areOpsCommutable(MI, BaseOpIdx, OtherOpIdx) ||
+        !MI->getOperand(OtherOpIdx).isReg())
+      continue;
+
+    unsigned DstOpReg = MI->getOperand(DstOpIdx).getReg();
+    unsigned BaseOpReg = MI->getOperand(BaseOpIdx).getReg();
+    unsigned OtherOpReg = MI->getOperand(OtherOpIdx).getReg();
+    bool AggressiveCommute = false;
+
+    // If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp
+    // operands. This makes the live ranges of DstOp and OtherOp joinable.
+    bool DoCommute =
+        !BaseOpKilled && isKilled(*MI, OtherOpReg, MRI, TII, LIS, false);
+
+    if (!DoCommute &&
+        isProfitableToCommute(DstOpReg, BaseOpReg, OtherOpReg, MI, Dist)) {
+      DoCommute = true;
+      AggressiveCommute = true;
+    }
+    // If it's profitable to commute, try to do so.
+    if (DoCommute && commuteInstruction(MI, BaseOpIdx, OtherOpIdx, Dist)) {
+      ++NumCommuted;
+      if (AggressiveCommute)
+        ++NumAggrCommuted;
+      return true;
+    }
+  }
+  return false;
+}
+
 /// tryInstructionTransform - For the case where an instruction has a single
 /// pair of tied register operands, attempt some transformations that may
 /// either eliminate the tied operands or improve the opportunities for
@@ -1181,31 +1231,7 @@
   if (TargetRegisterInfo::isVirtualRegister(regA))
     scanUses(regA);
 
-  // Check if it is profitable to commute the operands.
-  unsigned SrcOp1, SrcOp2;
-  unsigned regC = 0;
-  unsigned regCIdx = ~0U;
-  bool TryCommute = false;
-  bool AggressiveCommute = false;
-  if (MI.isCommutable() && MI.getNumOperands() >= 3 &&
-      TII->findCommutedOpIndices(&MI, SrcOp1, SrcOp2)) {
-    if (SrcIdx == SrcOp1)
-      regCIdx = SrcOp2;
-    else if (SrcIdx == SrcOp2)
-      regCIdx = SrcOp1;
-
-    if (regCIdx != ~0U) {
-      regC = MI.getOperand(regCIdx).getReg();
-      if (!regBKilled && isKilled(MI, regC, MRI, TII, LIS, false))
-        // If C dies but B does not, swap the B and C operands.
-        // This makes the live ranges of A and C joinable.
-        TryCommute = true;
-      else if (isProfitableToCommute(regA, regB, regC, &MI, Dist)) {
-        TryCommute = true;
-        AggressiveCommute = true;
-      }
-    }
-  }
+  bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
 
   // If the instruction is convertible to 3 Addr, instead
   // of returning try 3 Addr transformation aggresively and
@@ -1215,17 +1241,8 @@
   //   addl	%esi, %edi
   //   movl	%edi, %eax
   //   ret
-  bool Commuted = false;
-
-  // If it's profitable to commute, try to do so.
-  if (TryCommute && commuteInstruction(mi, regB, regC, Dist)) {
-    Commuted = true;
-    ++NumCommuted;
-    if (AggressiveCommute)
-      ++NumAggrCommuted;
-    if (!MI.isConvertibleTo3Addr())
-      return false;
-  }
+  if (Commuted && !MI.isConvertibleTo3Addr())
+    return false;
 
   if (shouldOnlyCommute)
     return false;
Index: llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -164,8 +164,8 @@
 
     // Operand is not legal, so try to commute the instruction to
     // see if this makes it possible to fold.
-    unsigned CommuteIdx0;
-    unsigned CommuteIdx1;
+    unsigned CommuteIdx0 = ~0U;
+    unsigned CommuteIdx1 = ~0U;
     bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);
 
     if (CanCommute) {
@@ -175,7 +175,13 @@
         OpNo = CommuteIdx0;
     }
 
-    if (!CanCommute || !TII->commuteInstruction(MI))
+    // FIXME: OpNo can be commuted with non-reg operand OtherOpNo, but
+    // such test cases are not handled well yet.
+    if (CanCommute &&
+        (!MI->getOperand(CommuteIdx0).isReg() || !MI->getOperand(CommuteIdx1).isReg()))
+      return false;
+
+    if (!CanCommute || !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1))
       return false;
 
     if (!TII->isOperandLegal(MI, OpNo, OpToFold))
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -120,7 +120,10 @@
   int commuteOpcode(const MachineInstr &MI) const;
 
   MachineInstr *commuteInstruction(MachineInstr *MI,
-                                   bool NewMI = false) const override;
+                                   bool NewMI,
+                                   unsigned OpIdx0,
+                                   unsigned OpIdx1) const override;
+
   bool findCommutedOpIndices(MachineInstr *MI,
                              unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -764,8 +764,18 @@
   return true;
 }
 
+/// Commutes the operands in the given instruction.
+/// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+///
+/// Do not call this method for a non-commutable instruction or for
+/// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+/// Even though the instruction is commutable, the method may still
+/// fail to commute the operands, null pointer is returned in such cases.
+///
 MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
-                                              bool NewMI) const {
+                                              bool NewMI,
+                                              unsigned OpIdx0,
+                                              unsigned OpIdx1) const {
 
   if (MI->getNumOperands() < 3)
     return nullptr;
@@ -784,7 +794,12 @@
 
   int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::src1);
-  if (Src1Idx == -1)
+  assert(Src1Idx != -1 && "Should always have src1 operand");
+
+  if (!(OpIdx0 == static_cast<unsigned>(Src0Idx) &&
+        OpIdx1 == static_cast<unsigned>(Src1Idx)) &&
+      !(OpIdx0 == static_cast<unsigned>(Src1Idx) &&
+        OpIdx1 == static_cast<unsigned>(Src0Idx)))
     return nullptr;
 
   MachineOperand &Src1 = MI->getOperand(Src1Idx);
@@ -832,7 +847,7 @@
     Src1.ChangeToRegister(Reg, false);
     Src1.setSubReg(SubReg);
   } else {
-    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+    MI = TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx0, OpIdx1);
   }
 
   if (MI)
@@ -845,8 +860,8 @@
 // between the true commutable operands, and the base
 // TargetInstrInfo::commuteInstruction uses it.
 bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
-                                        unsigned &SrcOpIdx1,
-                                        unsigned &SrcOpIdx2) const {
+                                        unsigned &SrcOpIdx0,
+                                        unsigned &SrcOpIdx1) const {
   const MCInstrDesc &MCID = MI->getDesc();
   if (!MCID.isCommutable())
     return false;
@@ -857,7 +872,8 @@
     return false;
 
   // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
-  // immediate.
+  // immediate. Also, immediate src0 operand is not handled in
+  // SIInstrInfo::commuteInstruction().
   if (!MI->getOperand(Src0Idx).isReg())
     return false;
 
@@ -865,18 +881,24 @@
   if (Src1Idx == -1)
     return false;
 
-  if (!MI->getOperand(Src1Idx).isReg())
-    return false;
-
-  // If any source modifiers are set, the generic instruction commuting won't
-  // understand how to copy the source modifiers.
-  if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
-      hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+  MachineOperand &Src1 = MI->getOperand(Src1Idx);
+  if (Src1.isImm()) {
+    // SIInstrInfo::commuteInstruction() does support commuting the immediate
+    // operand src1 in 2 and 3 operand instructions.
+    if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
+      return false;
+  }
+  else if (Src1.isReg()) {
+    // If any source modifiers are set, the generic instruction commuting won't
+    // understand how to copy the source modifiers.
+    if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
+        hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+      return false;
+  }
+  else
     return false;
 
-  SrcOpIdx1 = Src0Idx;
-  SrcOpIdx2 = Src1Idx;
-  return true;
+  return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
 }
 
 MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
@@ -1723,7 +1745,7 @@
     // than src1, so try to commute the instruction to decrease our
     // chances of having to insert a MOV instruction to legalize src1.
     if (MI->isCommutable()) {
-      if (commuteInstruction(MI))
+      if (TargetInstrInfo::commuteInstruction(MI))
         // If we are successful in commuting, then we know MI is legal, so
         // we are done.
         return;
Index: llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -182,7 +182,8 @@
   }
 
   // We have failed to fold src0, so commute the instruction and try again.
-  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI))
+  if (TryToCommute && MI.isCommutable() &&
+      TII->TargetInstrInfo::commuteInstruction(&MI))
     foldImmediates(MI, TII, MRI, false);
 
 }
@@ -221,7 +222,8 @@
       if (!canShrink(MI, TII, TRI, MRI)) {
         // Try commuting the instruction and see if that enables us to shrink
         // it.
-        if (!MI.isCommutable() || !TII->commuteInstruction(&MI) ||
+        if (!MI.isCommutable() ||
+            !TII->TargetInstrInfo::commuteInstruction(&MI) ||
             !canShrink(MI, TII, TRI, MRI))
           continue;
       }
Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -188,8 +188,10 @@
   MachineInstr *duplicate(MachineInstr *Orig,
                           MachineFunction &MF) const override;
 
-  MachineInstr *commuteInstruction(MachineInstr*,
-                                   bool=false) const override;
+  MachineInstr *commuteInstruction(MachineInstr *MI,
+                                   bool NewMI,
+                                   unsigned OpIdx1,
+                                   unsigned OpIdx2) const override;
 
   const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
                                      unsigned SubIdx, unsigned State,
Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1744,9 +1744,17 @@
   llvm_unreachable("Unknown unconditional branch opcode!");
 }
 
-/// commuteInstruction - Handle commutable instructions.
+/// Commutes the operands in the given instruction.
+/// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+///
+/// Do not call this method for a non-commutable instruction or for
+/// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+/// Even though the instruction is commutable, the method may still
+/// fail to commute the operands, null pointer is returned in such cases.
+///
 MachineInstr *
-ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI,
+                                     unsigned OpIdx1, unsigned OpIdx2) const {
   switch (MI->getOpcode()) {
   case ARM::MOVCCr:
   case ARM::t2MOVCCr: {
@@ -1756,7 +1764,7 @@
     // MOVCC AL can't be inverted. Shouldn't happen.
     if (CC == ARMCC::AL || PredReg != ARM::CPSR)
       return nullptr;
-    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+    MI = TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
     if (!MI)
       return nullptr;
     // After swapping the MOVCC operands, also invert the condition.
@@ -1765,7 +1773,7 @@
     return MI;
   }
   }
-  return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
 }
 
 /// Identify instructions that can be folded into a MOVCC instruction, and
Index: llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
===================================================================
--- llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -654,17 +654,18 @@
       if (Reg1 != Reg0)
         return false;
       // Try to commute the operands to make it a 2-address instruction.
-      MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+      MachineInstr *CommutedMI = TII->TargetInstrInfo::commuteInstruction(MI);
       if (!CommutedMI)
         return false;
     }
   } else if (Reg0 != Reg1) {
     // Try to commute the operands to make it a 2-address instruction.
-    unsigned CommOpIdx1, CommOpIdx2;
+    unsigned CommOpIdx1 = 1, CommOpIdx2 = ~0U;
     if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) ||
-        CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0)
+        MI->getOperand(CommOpIdx2).getReg() != Reg0)
       return false;
-    MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+    MachineInstr *CommutedMI = TII->commuteInstruction(MI, false,
+                                                       CommOpIdx1, CommOpIdx2);
     if (!CommutedMI)
       return false;
   }
Index: llvm/lib/Target/PowerPC/PPCInstrInfo.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -159,9 +159,21 @@
   unsigned isStoreToStackSlot(const MachineInstr *MI,
                               int &FrameIndex) const override;
 
-  // commuteInstruction - We can commute rlwimi instructions, but only if the
-  // rotate amt is zero.  We also have to munge the immediates a bit.
-  MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
+  /// Commutes the operands in the given instruction.
+  /// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+  ///
+  /// Do not call this method for a non-commutable instruction or for
+  /// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands, null pointer is returned in such cases.
+  ///
+  /// For example, we can commute rlwimi instructions, but only if the
+  /// rotate amt is zero.  We also have to munge the immediates a bit.
+  ///
+  MachineInstr *commuteInstruction(MachineInstr *MI,
+                                   bool NewMI,
+                                   unsigned OpIdx1,
+                                   unsigned OpIdx2) const override;
 
   bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
Index: llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -521,16 +521,26 @@
   return 0;
 }
 
-// commuteInstruction - We can commute rlwimi instructions, but only if the
-// rotate amt is zero.  We also have to munge the immediates a bit.
+/// Commutes the operands in the given instruction.
+/// The commutable operands are specified by their indices OpIdx1 and OpIdx2.
+///
+/// Do not call this method for a non-commutable instruction or for
+/// non-commutable pair of operand indices OpIdx1 and OpIdx2.
+/// Even though the instruction is commutable, the method may still
+/// fail to commute the operands, null pointer is returned in such cases.
+///
+/// For example, we can commute rlwimi instructions, but only if the
+/// rotate amt is zero.  We also have to munge the immediates a bit.
+///
 MachineInstr *
-PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI,
+                                 unsigned OpIdx1, unsigned OpIdx2) const {
   MachineFunction &MF = *MI->getParent()->getParent();
 
   // Normal instructions can be commuted the obvious way.
   if (MI->getOpcode() != PPC::RLWIMI &&
       MI->getOpcode() != PPC::RLWIMIo)
-    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
   // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
   // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
   // changing the relative order of the mask operands might change what happens
@@ -548,6 +558,8 @@
   //   Op0 = (Op2 & ~M) | (Op1 & M)
 
   // Swap op1/op2
+  assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) &&
+         "Only the operands 1 and 2 can be swapped in RLWIMI/RLWIMIo.");
   unsigned Reg0 = MI->getOperand(0).getReg();
   unsigned Reg1 = MI->getOperand(1).getReg();
   unsigned Reg2 = MI->getOperand(2).getReg();
@@ -610,9 +622,9 @@
   if (AltOpc == -1)
     return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
 
-  SrcOpIdx1 = 2;
-  SrcOpIdx2 = 3;
-  return true;
+  // The commutable operand indices are 2 and 3. Return them in SrcOpIdx1
+  // and SrcOpIdx2.
+  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3);
 }
 
 void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
Index: llvm/lib/Target/X86/X86InstrFMA.td
===================================================================
--- llvm/lib/Target/X86/X86InstrFMA.td
+++ llvm/lib/Target/X86/X86InstrFMA.td
@@ -60,27 +60,47 @@
                        string OpcodeStr, string PackTy,
                        PatFrag MemFrag128, PatFrag MemFrag256,
                        SDNode Op, ValueType OpTy128, ValueType OpTy256> {
-  // For 213, both the register and memory variant are commutable.
-  // Indeed, the commutable operands are 1 and 2 and both live in registers
-  // for both variants.
+let hasSideEffects = 0 in {
+  // For 213, both the register and memory variants are commutable.
+  // For the register form the commutable operands are 1, 2 and 3.
+  // For the memory variant the folded operand must be in 3. Thus,
+  // in that case, only the operands 1 and 2 can be swapped.
+  // Commuting some of the operands may require an opcode change:
+  //   operands 1 and 2 (memory & register forms): *213* --> *213*(no changes);
+  //   operands 1 and 3 (register forms only):     *213* --> *231*;
+  //   operands 2 and 3 (register forms only):     *213* --> *132*.
   defm r213 : fma3p_rm<opc213,
                        !strconcat(OpcodeStr, "213", PackTy),
                        MemFrag128, MemFrag256, OpTy128, OpTy256,
                        /* IsRVariantCommutable */ 1,
                        /* IsMVariantCommutable */ 1,
                        Op>;
-let hasSideEffects = 0 in {
+  // For 132, both the register and memory variants are commutable.
+  // For the register form the commutable operands are 1, 2 and 3.
+  // For the memory variant the folded operand must be in 3. Thus,
+  // in that case, only the operands 1 and 2 can be swapped.
+  // Commuting some of the operands may require an opcode change:
+  //   operands 1 and 2 (memory & register forms): *132* --> *231*;
+  //   operands 1 and 3 (register forms only):     *132* --> *132*(no changes);
+  //   operands 2 and 3 (register forms only):     *132* --> *213*.
   defm r132 : fma3p_rm<opc132,
                        !strconcat(OpcodeStr, "132", PackTy),
-                       MemFrag128, MemFrag256, OpTy128, OpTy256>;
-  // For 231, only the register variant is commutable.
+                       MemFrag128, MemFrag256, OpTy128, OpTy256,
+                       /* IsRVariantCommutable */ 1,
+                       /* IsMVariantCommutable */ 1>;
+  // For 231, both the register and memory variants are commutable.
+  // For the register form the commutable operands are 1, 2 and 3.
   // For the memory variant the folded operand must be in 3. Thus,
-  // in that case, it cannot be swapped with 2.
+  // in that case, only the operands 1 and 2 can be swapped.
+  // Commuting some of the operands may require an opcode change:
+  //   operands 1 and 2 (memory & register forms): *231* --> *132*;
+  //   operands 1 and 3 (register forms only):     *231* --> *213*;
+  //   operands 2 and 3 (register forms only):     *231* --> *231*(no changes).
   defm r231 : fma3p_rm<opc231,
                        !strconcat(OpcodeStr, "231", PackTy),
                        MemFrag128, MemFrag256, OpTy128, OpTy256,
                        /* IsRVariantCommutable */ 1,
-                       /* IsMVariantCommutable */ 0>;
+                       /* IsMVariantCommutable */ 1>;
 } // hasSideEffects = 0
 }
 
@@ -156,23 +176,54 @@
                        X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
                        ComplexPattern mem_cpat> {
 let hasSideEffects = 0 in {
+  // For 132, both the register and memory variants are commutable.
+  // For the register form the commutable operands are 1, 2 and 3.
+  // For the memory variant the folded operand must be in 3. Thus,
+  // in that case, only the operands 1 and 2 can be swapped.
+  // Commuting some of the operands may require an opcode change:
+  //   operands 1 and 2 (memory & register forms): *132* --> *231*;
+  //   operands 1 and 3 (register forms only):     *132* --> *132*(no changes);
+  //   operands 2 and 3 (register forms only):     *132* --> *213*.
+  // Commuting the operand 1 with some other operand changes the upper bits
+  // of the result FMA instruction. Thus, it requires a proof of the fact that
+  // only the lowest element of the result is used.
   defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
-                       x86memop, RC, OpVT, mem_frag>;
-  // See the other defm of r231 for the explanation regarding the
-  // commutable flags.
+                       x86memop, RC, OpVT, mem_frag,
+                       /* IsRVariantCommutable */ 1,
+                       /* IsMVariantCommutable */ 1>;
+  // For 231, both the register and memory variants are commutable.
+  // For the register form the commutable operands are 1, 2 and 3.
+  // For the memory variant the folded operand must be in 3. Thus,
+  // in that case, only the operands 1 and 2 can be swapped.
+  // Commuting some of the operands may require an opcode change:
+  //   operands 1 and 2 (memory & register forms): *231* --> *132*;
+  //   operands 1 and 3 (register forms only):     *231* --> *213*;
+  //   operands 2 and 3 (register forms only):     *231* --> *231*(no changes).
+  // Commuting the operand 1 with some other operand changes the upper bits
+  // of the result FMA instruction. Thus, it requires a proof of the fact that
+  // only the lowest element of the result is used.
   defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
                        x86memop, RC, OpVT, mem_frag,
                        /* IsRVariantCommutable */ 1,
-                       /* IsMVariantCommutable */ 0>;
-}
+                       /* IsMVariantCommutable */ 1>;
 
-// See the other defm of r213 for the explanation regarding the
-// commutable flags.
-defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
-                     x86memop, RC, OpVT, mem_frag,
-                     /* IsRVariantCommutable */ 1,
-                     /* IsMVariantCommutable */ 1,
-                     OpNode>;
+  // For 213, both the register and memory variants are commutable.
+  // For the register form the commutable operands are 1, 2 and 3.
+  // For the memory variant the folded operand must be in 3. Thus,
+  // in that case, only the operands 1 and 2 can be swapped.
+  // Commuting some of the operands may require an opcode change:
+  //   operands 1 and 2 (memory & register forms): *213* --> *213*(no changes);
+  //   operands 1 and 3 (register forms only):     *213* --> *231*;
+  //   operands 2 and 3 (register forms only):     *213* --> *132*.
+  // Commuting the operand 1 with some other operand changes the upper bits
+  // of the result FMA instruction. Thus, it requires a proof of the fact that
+  // only the lowest element of the result is used.
+  defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+                       x86memop, RC, OpVT, mem_frag,
+                       /* IsRVariantCommutable */ 1,
+                       /* IsMVariantCommutable */ 1,
+                       OpNode>;
+}
 }
 
 multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -259,14 +259,85 @@
                                       MachineBasicBlock::iterator &MBBI,
                                       LiveVariables *LV) const override;
 
-  /// commuteInstruction - We have a few instructions that must be hacked on to
-  /// commute them.
+  /// Commutes the operands in the given instruction by changing the operands
+  /// order and/or changing the instruction's opcode and/or the immediate value
+  /// operand.
+  ///
+  /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
+  /// to be commuted.
+  ///
+  /// Do not call this method for a non-commutable instruction.
+  /// Even though the instruction is commutable, the method may still
+  /// fail to commute the operands, null pointer is returned in such cases.
+  ///
+  MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI,
+                                   unsigned CommuteOpIdx1,
+                                   unsigned CommuteOpIdx2) const override;
+
+  /// Returns true iff the routine could find two commutable operands in the
+  /// given machine instruction.
+  /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+  /// input values can be re-defined in this method only if the input values
+  /// are not pre-defined, which is designated by the special value ~0U
+  /// assigned to it.
+  /// If both of indices are pre-defined and refer to some operands, then the
+  /// method simply returns true if the corresponding operands are commutable
+  /// and returns false otherwise.
+  ///
+  /// For example, calling this method this way:
+  ///     unsigned Op1 = 1, Op2 = ~0U;
+  ///     findCommutedOpIndices(MI, Op1, Op2);
+  /// can be interpreted as a query asking to find an operand that would be
+  /// commutable with the operand#1.
   ///
-  MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
-
   bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
+  /// Returns true if the routine could find two commutable operands
+  /// in the given FMA instruction. Otherwise, returns false.
+  ///
+  /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments.
+  /// The output indices of the commuted operands are returned in these
+  /// arguments. Also, the input values of these arguments may be preset either
+  /// to indices of operands that must be commuted or be equal to a special
+  /// value (~0U) which means that the corresponding operand index is not set
+  /// and this method is free to pick any of available commutable operands.
+  ///
+  /// For example, given 'unsigned Op1 = 1, Op2 = ~0U;', the call
+  ///     findFMA3CommutedOpIndices(MI, Op1, Op2);
+  /// can be interpreted as a query asking if the operand #1 can be swapped
+  /// with any other available operand (e.g. operand #2, operand #3, etc.).
+  ///
+  /// Commuting the operands may require changing the FMA opcode of MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results in an instruction with an adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  ///
+  bool findFMA3CommutedOpIndices(MachineInstr *MI,
+                                 unsigned &SrcOpIdx1,
+                                 unsigned &SrcOpIdx2) const;
+
+  /// Returns an adjusted FMA opcode that must be used in FMA instruction that
+  /// performs the same computations as the given MI but which has the operands
+  /// SrcOpIdx1 and SrcOpIdx2 commuted.
+  /// It may return 0 if it is unsafe to commute the operands.
+  ///
+  /// The returned FMA opcode may differ from the opcode in the given MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results in an instruction with an adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  ///
+  unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
+                                          unsigned SrcOpIdx1,
+                                          unsigned SrcOpIdx2) const;
+
+  /// Returns true if the given instruction opcode is FMA3.
+  /// Otherwise, returns false.
+  ///
+  bool isFMA3(unsigned Opcode) const;
+
   // Branch analysis.
   bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2923,10 +2923,21 @@
   return NewMI;
 }
 
-/// We have a few instructions that must be hacked on to commute them.
+/// Commutes the operands in the given instruction by changing the operands
+/// order and/or changing the instruction's opcode and/or the immediate value
+/// operand.
+/// The arguments 'OpIdx1' and 'OpIdx2' specify the operands to be commuted.
+///
+/// Do not call this method for a non-commutable instruction.
+/// Even though the instruction is commutable, the method may still
+/// fail to commute the operands, null pointer is returned in such cases.
 ///
 MachineInstr *
-X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI,
+                                 unsigned OpIdx1,
+                                 unsigned OpIdx2) const {
+
+
   switch (MI->getOpcode()) {
   case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
   case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
@@ -2953,7 +2964,7 @@
     }
     MI->setDesc(get(Opc));
     MI->getOperand(3).setImm(Size-Amt);
-    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
   }
   case X86::BLENDPDrri:
   case X86::BLENDPSrri:
@@ -2989,7 +3000,7 @@
       NewMI = false;
     }
     MI->getOperand(3).setImm(Mask ^ Imm);
-    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
   }
   case X86::PCLMULQDQrr:
   case X86::VPCLMULQDQrr:{
@@ -3004,7 +3015,7 @@
       NewMI = false;
     }
     MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
-    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
   }
   case X86::CMPPDrri:
   case X86::CMPPSrri:
@@ -3025,7 +3036,7 @@
         MI = MF.CloneMachineInstr(MI);
         NewMI = false;
       }
-      return TargetInstrInfo::commuteInstruction(MI, NewMI);
+      return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
     default:
       return nullptr;
     }
@@ -3054,7 +3065,7 @@
       NewMI = false;
     }
     MI->getOperand(3).setImm(Imm);
-    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
   }
   case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
   case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
@@ -3133,11 +3144,407 @@
     // Fallthrough intended.
   }
   default:
-    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+    if (isFMA3(MI->getOpcode())) {
+      unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2);
+      if (Opc == 0) {
+        return nullptr;
+      }
+      if (NewMI) {
+        MachineFunction &MF = *MI->getParent()->getParent();
+        MI = MF.CloneMachineInstr(MI);
+        NewMI = false;
+      }
+      MI->setDesc(get(Opc));
+      return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
+    }
+    return TargetInstrInfo::commuteInstruction(MI, NewMI, OpIdx1, OpIdx2);
+  }
+}
+
+///
+/// Returns true if the given instruction opcode is one of the FMA3 opcodes
+/// enumerated below (132, 213 and 231 forms). Otherwise, returns false.
+///
+bool X86InstrInfo::isFMA3(unsigned Opcode) const {
+  switch (Opcode) {
+    case X86::VFMADDSDr132r:     case X86::VFMADDSDr132m:
+    case X86::VFMADDSSr132r:     case X86::VFMADDSSr132m:
+    case X86::VFMSUBSDr132r:     case X86::VFMSUBSDr132m:
+    case X86::VFMSUBSSr132r:     case X86::VFMSUBSSr132m:
+    case X86::VFNMADDSDr132r:    case X86::VFNMADDSDr132m:
+    case X86::VFNMADDSSr132r:    case X86::VFNMADDSSr132m:
+    case X86::VFNMSUBSDr132r:    case X86::VFNMSUBSDr132m:
+    case X86::VFNMSUBSSr132r:    case X86::VFNMSUBSSr132m:
+
+    case X86::VFMADDSDr213r:     case X86::VFMADDSDr213m:
+    case X86::VFMADDSSr213r:     case X86::VFMADDSSr213m:
+    case X86::VFMSUBSDr213r:     case X86::VFMSUBSDr213m:
+    case X86::VFMSUBSSr213r:     case X86::VFMSUBSSr213m:
+    case X86::VFNMADDSDr213r:    case X86::VFNMADDSDr213m:
+    case X86::VFNMADDSSr213r:    case X86::VFNMADDSSr213m:
+    case X86::VFNMSUBSDr213r:    case X86::VFNMSUBSDr213m:
+    case X86::VFNMSUBSSr213r:    case X86::VFNMSUBSSr213m:
+
+    case X86::VFMADDSDr231r:     case X86::VFMADDSDr231m:
+    case X86::VFMADDSSr231r:     case X86::VFMADDSSr231m:
+    case X86::VFMSUBSDr231r:     case X86::VFMSUBSDr231m:
+    case X86::VFMSUBSSr231r:     case X86::VFMSUBSSr231m:
+    case X86::VFNMADDSDr231r:    case X86::VFNMADDSDr231m:
+    case X86::VFNMADDSSr231r:    case X86::VFNMADDSSr231m:
+    case X86::VFNMSUBSDr231r:    case X86::VFNMSUBSDr231m:
+    case X86::VFNMSUBSSr231r:    case X86::VFNMSUBSSr231m:
+
+    case X86::VFMADDSUBPDr132r:  case X86::VFMADDSUBPDr132m:
+    case X86::VFMADDSUBPSr132r:  case X86::VFMADDSUBPSr132m:
+    case X86::VFMSUBADDPDr132r:  case X86::VFMSUBADDPDr132m:
+    case X86::VFMSUBADDPSr132r:  case X86::VFMSUBADDPSr132m:
+    case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY:
+    case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY:
+    case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY:
+    case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY:
+
+    case X86::VFMADDPDr132r:     case X86::VFMADDPDr132m:
+    case X86::VFMADDPSr132r:     case X86::VFMADDPSr132m:
+    case X86::VFMSUBPDr132r:     case X86::VFMSUBPDr132m:
+    case X86::VFMSUBPSr132r:     case X86::VFMSUBPSr132m:
+    case X86::VFNMADDPDr132r:    case X86::VFNMADDPDr132m:
+    case X86::VFNMADDPSr132r:    case X86::VFNMADDPSr132m:
+    case X86::VFNMSUBPDr132r:    case X86::VFNMSUBPDr132m:
+    case X86::VFNMSUBPSr132r:    case X86::VFNMSUBPSr132m:
+    case X86::VFMADDPDr132rY:    case X86::VFMADDPDr132mY:
+    case X86::VFMADDPSr132rY:    case X86::VFMADDPSr132mY:
+    case X86::VFMSUBPDr132rY:    case X86::VFMSUBPDr132mY:
+    case X86::VFMSUBPSr132rY:    case X86::VFMSUBPSr132mY:
+    case X86::VFNMADDPDr132rY:   case X86::VFNMADDPDr132mY:
+    case X86::VFNMADDPSr132rY:   case X86::VFNMADDPSr132mY:
+    case X86::VFNMSUBPDr132rY:   case X86::VFNMSUBPDr132mY:
+    case X86::VFNMSUBPSr132rY:   case X86::VFNMSUBPSr132mY:
+
+    case X86::VFMADDSUBPDr213r:  case X86::VFMADDSUBPDr213m:
+    case X86::VFMADDSUBPSr213r:  case X86::VFMADDSUBPSr213m:
+    case X86::VFMSUBADDPDr213r:  case X86::VFMSUBADDPDr213m:
+    case X86::VFMSUBADDPSr213r:  case X86::VFMSUBADDPSr213m:
+    case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY:
+    case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY:
+    case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY:
+    case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY:
+
+    case X86::VFMADDPDr213r:     case X86::VFMADDPDr213m:
+    case X86::VFMADDPSr213r:     case X86::VFMADDPSr213m:
+    case X86::VFMSUBPDr213r:     case X86::VFMSUBPDr213m:
+    case X86::VFMSUBPSr213r:     case X86::VFMSUBPSr213m:
+    case X86::VFNMADDPDr213r:    case X86::VFNMADDPDr213m:
+    case X86::VFNMADDPSr213r:    case X86::VFNMADDPSr213m:
+    case X86::VFNMSUBPDr213r:    case X86::VFNMSUBPDr213m:
+    case X86::VFNMSUBPSr213r:    case X86::VFNMSUBPSr213m:
+    case X86::VFMADDPDr213rY:    case X86::VFMADDPDr213mY:
+    case X86::VFMADDPSr213rY:    case X86::VFMADDPSr213mY:
+    case X86::VFMSUBPDr213rY:    case X86::VFMSUBPDr213mY:
+    case X86::VFMSUBPSr213rY:    case X86::VFMSUBPSr213mY:
+    case X86::VFNMADDPDr213rY:   case X86::VFNMADDPDr213mY:
+    case X86::VFNMADDPSr213rY:   case X86::VFNMADDPSr213mY:
+    case X86::VFNMSUBPDr213rY:   case X86::VFNMSUBPDr213mY:
+    case X86::VFNMSUBPSr213rY:   case X86::VFNMSUBPSr213mY:
+
+    case X86::VFMADDSUBPDr231r:  case X86::VFMADDSUBPDr231m:
+    case X86::VFMADDSUBPSr231r:  case X86::VFMADDSUBPSr231m:
+    case X86::VFMSUBADDPDr231r:  case X86::VFMSUBADDPDr231m:
+    case X86::VFMSUBADDPSr231r:  case X86::VFMSUBADDPSr231m:
+    case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY:
+    case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY:
+    case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY:
+    case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY:
+
+    case X86::VFMADDPDr231r:     case X86::VFMADDPDr231m:
+    case X86::VFMADDPSr231r:     case X86::VFMADDPSr231m:
+    case X86::VFMSUBPDr231r:     case X86::VFMSUBPDr231m:
+    case X86::VFMSUBPSr231r:     case X86::VFMSUBPSr231m:
+    case X86::VFNMADDPDr231r:    case X86::VFNMADDPDr231m:
+    case X86::VFNMADDPSr231r:    case X86::VFNMADDPSr231m:
+    case X86::VFNMSUBPDr231r:    case X86::VFNMSUBPDr231m:
+    case X86::VFNMSUBPSr231r:    case X86::VFNMSUBPSr231m:
+    case X86::VFMADDPDr231rY:    case X86::VFMADDPDr231mY:
+    case X86::VFMADDPSr231rY:    case X86::VFMADDPSr231mY:
+    case X86::VFMSUBPDr231rY:    case X86::VFMSUBPDr231mY:
+    case X86::VFMSUBPSr231rY:    case X86::VFMSUBPSr231mY:
+    case X86::VFNMADDPDr231rY:   case X86::VFNMADDPDr231mY:
+    case X86::VFNMADDPSr231rY:   case X86::VFNMADDPSr231mY:
+    case X86::VFNMSUBPDr231rY:   case X86::VFNMSUBPDr231mY:
+    case X86::VFNMSUBPSr231rY:   case X86::VFNMSUBPSr231mY:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+///
+/// Returns true if the routine could find two commutable operands
+/// in the given FMA instruction. Otherwise, returns false.
+///
+/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments.
+/// The output indices of the commuted operands are returned in these
+/// arguments. Also, the input values of these arguments may be preset either
+/// to indices of operands that must be commuted or be equal to a special
+/// value (~0U) which means that the corresponding operand index is not set
+/// and this method is free to pick any of the available commutable operands.
+///
+/// For example, given 'unsigned Op1 = 1, Op2 = ~0U;', the call
+///     findFMA3CommutedOpIndices(MI, Op1, Op2);
+/// can be interpreted as a query asking if the operand #1 can be swapped
+/// with any other available operand (e.g. operand #2, operand #3, etc.).
+///
+/// Commuting the operands may require changing the FMA opcode of MI.
+/// For example, commuting the operands #1 and #3 in the following FMA
+///     FMA213 #1, #2, #3
+/// results in an instruction with an adjusted opcode:
+///     FMA231 #3, #2, #1
+///
+bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI,
+                                             unsigned &SrcOpIdx1,
+                                             unsigned &SrcOpIdx2) const {
+  // RegOpsNum is 2 when isMem(MI, 3) reports a memory operand, 3 otherwise.
+  unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
+
+  // Index 0 and indices above RegOpsNum are rejected:
+  // only the first RegOpsNum operands are commutable.
+  // Also, the value ~0U is valid here as it means that the operand is not
+  // specified/fixed.
+  //
+  if (SrcOpIdx1 < 1 || (SrcOpIdx1 > RegOpsNum && SrcOpIdx1 != ~0U) ||
+      SrcOpIdx2 < 1 || (SrcOpIdx2 > RegOpsNum && SrcOpIdx2 != ~0U)) {
+    return false;
   }
+
+  if (SrcOpIdx1 == ~0U || SrcOpIdx2 == ~0U) {
+    unsigned CommutableOpIdx1 = SrcOpIdx1;
+    unsigned CommutableOpIdx2 = SrcOpIdx2;
+
+    // At least one of the operands to be commuted is not specified, so
+    // this method is free to choose the missing one(s). The fixed index,
+    // if any, is kept in CommutableOpIdx2; CommutableOpIdx1 is then
+    // searched for below.
+    if (SrcOpIdx1 == SrcOpIdx2) {
+      // Both indices are ~0U, i.e. neither operand is fixed. By default set
+      // one of the commutable operands to the last commutable operand
+      // (index RegOpsNum).
+      CommutableOpIdx2 = RegOpsNum;
+    }
+    else if (SrcOpIdx2 == ~0U) {
+      // Only SrcOpIdx1 is fixed; carry it over to CommutableOpIdx2.
+      // (If only SrcOpIdx2 is fixed, CommutableOpIdx2 already holds it.)
+      CommutableOpIdx2 = SrcOpIdx1;
+    }
+
+    // CommutableOpIdx2 is well defined now. Let's choose another commutable
+    // operand and assign its index to CommutableOpIdx1. The scan goes from
+    // index RegOpsNum down to operand #1.
+    unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg();
+    for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
+      // The commuted operands must have different registers.
+      // Otherwise, the commute transformation does not change anything and
+      // is useless then. This also skips CommutableOpIdx2 itself, since an
+      // operand's register always equals itself.
+      if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg())
+        break;
+    }
+
+    // The loop ran down to 0: no appropriate commutable operand was found.
+    //
+    if (CommutableOpIdx1 == 0)
+      return false;
+
+    // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
+    // to return those values; fail if fixCommutedOpIndices rejects the pair.
+    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+                              CommutableOpIdx1, CommutableOpIdx2))
+      return false;
+  }
+  return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0;
 }
 
-bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+///
+/// Returns an adjusted FMA opcode that must be used in FMA instruction that
+/// performs the same computations as the given MI but which has the operands
+/// SrcOpIdx1 and SrcOpIdx2 commuted. Returns 0 if MI is not a known FMA3
+/// instruction, if the pair is not (1,2), (1,3) or (2,3) in either order, or
+/// if it is unsafe to commute the operands (scalar FMA with operand #1).
+/// The returned FMA opcode may differ from the opcode in the given MI.
+/// For example, commuting the operands #1 and #3 in the following FMA
+///     FMA213 #1, #2, #3
+/// results in an instruction with an adjusted opcode:
+///     FMA231 #3, #2, #1
+///
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI,
+                                                      unsigned SrcOpIdx1,
+                                                      unsigned SrcOpIdx2) const {
+  unsigned RetOpc = 0;
+  unsigned Opc = MI->getOpcode();
+
+  // Each row lists the 132, 213 and 231 forms (Opc1, Opc2, Opc3) of one
+  // FMA instruction, plus a flag telling whether that instruction is scalar.
+  //
+  static const struct {
+    unsigned Opc1;
+    unsigned Opc2;
+    unsigned Opc3;
+    bool IsScalar;
+  } OpcodeAlts[] = {
+    { X86::VFMADDSSr132r,   X86::VFMADDSSr213r,   X86::VFMADDSSr231r, true  },
+    { X86::VFMADDSDr132r,   X86::VFMADDSDr213r,   X86::VFMADDSDr231r, true  },
+    { X86::VFMADDPSr132r,   X86::VFMADDPSr213r,   X86::VFMADDPSr231r, false },
+    { X86::VFMADDPDr132r,   X86::VFMADDPDr213r,   X86::VFMADDPDr231r, false },
+    { X86::VFMADDPSr132rY,  X86::VFMADDPSr213rY,  X86::VFMADDPSr231rY,false },
+    { X86::VFMADDPDr132rY,  X86::VFMADDPDr213rY,  X86::VFMADDPDr231rY,false },
+    { X86::VFMADDSSr132m,   X86::VFMADDSSr213m,   X86::VFMADDSSr231m, true  },
+    { X86::VFMADDSDr132m,   X86::VFMADDSDr213m,   X86::VFMADDSDr231m, true  },
+    { X86::VFMADDPSr132m,   X86::VFMADDPSr213m,   X86::VFMADDPSr231m, false },
+    { X86::VFMADDPDr132m,   X86::VFMADDPDr213m,   X86::VFMADDPDr231m, false },
+    { X86::VFMADDPSr132mY,  X86::VFMADDPSr213mY,  X86::VFMADDPSr231mY,false },
+    { X86::VFMADDPDr132mY,  X86::VFMADDPDr213mY,  X86::VFMADDPDr231mY,false },
+
+    { X86::VFMSUBSSr132r,   X86::VFMSUBSSr213r,   X86::VFMSUBSSr231r, true  },
+    { X86::VFMSUBSDr132r,   X86::VFMSUBSDr213r,   X86::VFMSUBSDr231r, true  },
+    { X86::VFMSUBPSr132r,   X86::VFMSUBPSr213r,   X86::VFMSUBPSr231r, false },
+    { X86::VFMSUBPDr132r,   X86::VFMSUBPDr213r,   X86::VFMSUBPDr231r, false },
+    { X86::VFMSUBPSr132rY,  X86::VFMSUBPSr213rY,  X86::VFMSUBPSr231rY,false },
+    { X86::VFMSUBPDr132rY,  X86::VFMSUBPDr213rY,  X86::VFMSUBPDr231rY,false },
+    { X86::VFMSUBSSr132m,   X86::VFMSUBSSr213m,   X86::VFMSUBSSr231m, true  },
+    { X86::VFMSUBSDr132m,   X86::VFMSUBSDr213m,   X86::VFMSUBSDr231m, true  },
+    { X86::VFMSUBPSr132m,   X86::VFMSUBPSr213m,   X86::VFMSUBPSr231m, false },
+    { X86::VFMSUBPDr132m,   X86::VFMSUBPDr213m,   X86::VFMSUBPDr231m, false },
+    { X86::VFMSUBPSr132mY,  X86::VFMSUBPSr213mY,  X86::VFMSUBPSr231mY,false },
+    { X86::VFMSUBPDr132mY,  X86::VFMSUBPDr213mY,  X86::VFMSUBPDr231mY,false },
+
+    { X86::VFNMADDSSr132r,  X86::VFNMADDSSr213r,  X86::VFNMADDSSr231r, true  },
+    { X86::VFNMADDSDr132r,  X86::VFNMADDSDr213r,  X86::VFNMADDSDr231r, true  },
+    { X86::VFNMADDPSr132r,  X86::VFNMADDPSr213r,  X86::VFNMADDPSr231r, false },
+    { X86::VFNMADDPDr132r,  X86::VFNMADDPDr213r,  X86::VFNMADDPDr231r, false },
+    { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY,false },
+    { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY,false },
+    { X86::VFNMADDSSr132m,  X86::VFNMADDSSr213m,  X86::VFNMADDSSr231m, true  },
+    { X86::VFNMADDSDr132m,  X86::VFNMADDSDr213m,  X86::VFNMADDSDr231m, true  },
+    { X86::VFNMADDPSr132m,  X86::VFNMADDPSr213m,  X86::VFNMADDPSr231m, false },
+    { X86::VFNMADDPDr132m,  X86::VFNMADDPDr213m,  X86::VFNMADDPDr231m, false },
+    { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY,false },
+    { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY,false },
+
+    { X86::VFNMSUBSSr132r,  X86::VFNMSUBSSr213r,  X86::VFNMSUBSSr231r, true  },
+    { X86::VFNMSUBSDr132r,  X86::VFNMSUBSDr213r,  X86::VFNMSUBSDr231r, true  },
+    { X86::VFNMSUBPSr132r,  X86::VFNMSUBPSr213r,  X86::VFNMSUBPSr231r, false },
+    { X86::VFNMSUBPDr132r,  X86::VFNMSUBPDr213r,  X86::VFNMSUBPDr231r, false },
+    { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY,false },
+    { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY,false },
+    { X86::VFNMSUBSSr132m,  X86::VFNMSUBSSr213m,  X86::VFNMSUBSSr231m, true  },
+    { X86::VFNMSUBSDr132m,  X86::VFNMSUBSDr213m,  X86::VFNMSUBSDr231m, true  },
+    { X86::VFNMSUBPSr132m,  X86::VFNMSUBPSr213m,  X86::VFNMSUBPSr231m, false },
+    { X86::VFNMSUBPDr132m,  X86::VFNMSUBPDr213m,  X86::VFNMSUBPDr231m, false },
+    { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY,false },
+    { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY,false },
+
+    { X86::VFMADDSUBPSr132r,  X86::VFMADDSUBPSr213r,  X86::VFMADDSUBPSr231r, false },
+    { X86::VFMADDSUBPDr132r,  X86::VFMADDSUBPDr213r,  X86::VFMADDSUBPDr231r, false },
+    { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY,false },
+    { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY,false },
+    { X86::VFMADDSUBPSr132m,  X86::VFMADDSUBPSr213m,  X86::VFMADDSUBPSr231m, false },
+    { X86::VFMADDSUBPDr132m,  X86::VFMADDSUBPDr213m,  X86::VFMADDSUBPDr231m, false },
+    { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY,false },
+    { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY,false },
+
+    { X86::VFMSUBADDPSr132r,  X86::VFMSUBADDPSr213r,  X86::VFMSUBADDPSr231r, false },
+    { X86::VFMSUBADDPDr132r,  X86::VFMSUBADDPDr213r,  X86::VFMSUBADDPDr231r, false },
+    { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY,false },
+    { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY,false },
+    { X86::VFMSUBADDPSr132m,  X86::VFMSUBADDPSr213m,  X86::VFMSUBADDPSr231m, false },
+    { X86::VFMSUBADDPDr132m,  X86::VFMSUBADDPDr213m,  X86::VFMSUBADDPDr231m, false },
+    { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY,false },
+    { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY,false }
+  };
+  // Find the table row (i) and the column (pos: 1..3) that holds Opc.
+  const unsigned OpcodeAltsNum = sizeof(OpcodeAlts) / sizeof(OpcodeAlts[0]);
+  unsigned i, pos = 0;
+  for (i = 0; i < OpcodeAltsNum; i++) {
+    if (OpcodeAlts[i].Opc2 == Opc) {
+      pos = 2;
+      break;
+    }
+    if (OpcodeAlts[i].Opc1 == Opc) {
+      pos = 1;
+      break;
+    }
+    if (OpcodeAlts[i].Opc3 == Opc) {
+      pos = 3;
+      break;
+    }
+  }
+
+  // Input opcode does not match any entry of the table, so it is not a
+  // known FMA3 opcode and cannot be commuted here.
+  //
+  if (pos == 0)
+    return 0;
+
+  // FIXME: Commuting the 1st operand of scalar FMA requires some additional
+  // analysis such as getting proof of the fact that all uses of the
+  // given FMA instruction use only the lowest element. Without such a
+  // proof, commuting the 1st operand of scalar FMAs changes the upper bits
+  // of the result, so the commute is conservatively rejected here.
+  //
+  if (OpcodeAlts[i].IsScalar && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1))
+    return 0;
+
+  // Pick the form that computes the same value once the two operands are
+  // swapped: (1,2): 132<->231, 213 stays; (1,3): 213<->231, 132 stays;
+  // (2,3): 132<->213, 231 stays. Any other index pair yields 0.
+  if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 2) ||
+      (SrcOpIdx1 == 2 && SrcOpIdx2 == 1)) {
+    if (pos == 1)
+      RetOpc = OpcodeAlts[i].Opc3;
+    else if (pos == 2)
+      RetOpc = Opc;
+    else
+      RetOpc = OpcodeAlts[i].Opc1;
+  }
+  else if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 3) ||
+           (SrcOpIdx1 == 3 && SrcOpIdx2 == 1)) {
+    if (pos == 1)
+      RetOpc = Opc;
+    else if (pos == 2)
+      RetOpc = OpcodeAlts[i].Opc3;
+    else
+      RetOpc = OpcodeAlts[i].Opc2;
+  }
+  else if ((SrcOpIdx1 == 2 && SrcOpIdx2 == 3) ||
+           (SrcOpIdx1 == 3 && SrcOpIdx2 == 2)) {
+    if (pos == 1)
+      RetOpc = OpcodeAlts[i].Opc2;
+    else if (pos == 2)
+      RetOpc = OpcodeAlts[i].Opc1;
+    else
+      RetOpc = Opc;
+  }
+
+  return RetOpc;
+}
+
+/// Returns true iff the routine could find two commutable operands in the
+/// given machine instruction.
+/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
+/// input values can be re-defined in this method only if the input values
+/// are not pre-defined, which is designated by the special value ~0U
+/// assigned to it.
+/// If both of indices are pre-defined and refer to some operands, then the
+/// method simply returns true if the corresponding operands are commutable
+/// and returns false otherwise.
+///
+/// For example, calling this method this way:
+///     unsigned Op1 = 1, Op2 = ~0U;
+///     findCommutedOpIndices(MI, Op1, Op2);
+/// can be interpreted as a query asking to find an operand that would be
+/// commutable with the operand#1.
+///
+bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI,
+                                         unsigned &SrcOpIdx1,
                                          unsigned &SrcOpIdx2) const {
   switch (MI->getOpcode()) {
     case X86::CMPPDrri:
@@ -3150,46 +3557,24 @@
       // Ordered/Unordered/Equal/NotEqual tests
       unsigned Imm = MI->getOperand(3).getImm() & 0x7;
       switch (Imm) {
-      case 0x00: // EQUAL
-      case 0x03: // UNORDERED
-      case 0x04: // NOT EQUAL
-      case 0x07: // ORDERED
-        SrcOpIdx1 = 1;
-        SrcOpIdx2 = 2;
-        return true;
+        case 0x00: // EQUAL
+        case 0x03: // UNORDERED
+        case 0x04: // NOT EQUAL
+        case 0x07: // ORDERED
+          // The indices of the commutable operands are 1 and 2.
+          // Assign them to the returned operand indices here.
+          return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
       }
       return false;
     }
-    case X86::VFMADDPDr231r:
-    case X86::VFMADDPSr231r:
-    case X86::VFMADDSDr231r:
-    case X86::VFMADDSSr231r:
-    case X86::VFMSUBPDr231r:
-    case X86::VFMSUBPSr231r:
-    case X86::VFMSUBSDr231r:
-    case X86::VFMSUBSSr231r:
-    case X86::VFNMADDPDr231r:
-    case X86::VFNMADDPSr231r:
-    case X86::VFNMADDSDr231r:
-    case X86::VFNMADDSSr231r:
-    case X86::VFNMSUBPDr231r:
-    case X86::VFNMSUBPSr231r:
-    case X86::VFNMSUBSDr231r:
-    case X86::VFNMSUBSSr231r:
-    case X86::VFMADDPDr231rY:
-    case X86::VFMADDPSr231rY:
-    case X86::VFMSUBPDr231rY:
-    case X86::VFMSUBPSr231rY:
-    case X86::VFNMADDPDr231rY:
-    case X86::VFNMADDPSr231rY:
-    case X86::VFNMSUBPDr231rY:
-    case X86::VFNMSUBPSr231rY:
-      SrcOpIdx1 = 2;
-      SrcOpIdx2 = 3;
-      return true;
     default:
+      if (isFMA3(MI->getOpcode())) {
+        return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+      }
       return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
   }
+
+  return false;
 }
 
 static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
@@ -4972,60 +5357,58 @@
   // If the instruction and target operand are commutable, commute the
   // instruction and try again.
   if (AllowCommute) {
-    unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2;
+    unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = ~0U;
     if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
       bool HasDef = MI->getDesc().getNumDefs();
       unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
       unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
       unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
-      bool Tied0 =
-          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
       bool Tied1 =
+          0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
+      bool Tied2 =
           0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
 
       // If either of the commutable operands are tied to the destination
       // then we can not commute + fold.
-      if ((HasDef && Reg0 == Reg1 && Tied0) ||
-          (HasDef && Reg0 == Reg2 && Tied1))
+      if ((HasDef && Reg0 == Reg1 && Tied1) ||
+          (HasDef && Reg0 == Reg2 && Tied2))
         return nullptr;
 
-      if ((CommuteOpIdx1 == OriginalOpIdx) ||
-          (CommuteOpIdx2 == OriginalOpIdx)) {
-        MachineInstr *CommutedMI = commuteInstruction(MI, false);
-        if (!CommutedMI) {
-          // Unable to commute.
-          return nullptr;
-        }
-        if (CommutedMI != MI) {
-          // New instruction. We can't fold from this.
-          CommutedMI->eraseFromParent();
-          return nullptr;
-        }
+      MachineInstr *CommutedMI = commuteInstruction(MI, false,
+                                                    CommuteOpIdx1,
+                                                    CommuteOpIdx2);
+      if (!CommutedMI) {
+        // Unable to commute.
+        return nullptr;
+      }
+      if (CommutedMI != MI) {
+        // New instruction. We can't fold from this.
+        CommutedMI->eraseFromParent();
+        return nullptr;
+      }
 
-        // Attempt to fold with the commuted version of the instruction.
-        unsigned CommuteOp =
-            (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
-        NewMI =
-            foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align,
-                                  /*AllowCommute=*/false);
-        if (NewMI)
-          return NewMI;
-
-        // Folding failed again - undo the commute before returning.
-        MachineInstr *UncommutedMI = commuteInstruction(MI, false);
-        if (!UncommutedMI) {
-          // Unable to commute.
-          return nullptr;
-        }
-        if (UncommutedMI != MI) {
-          // New instruction. It doesn't need to be kept.
-          UncommutedMI->eraseFromParent();
-          return nullptr;
-        }
+      // Attempt to fold with the commuted version of the instruction.
+      NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
+                                    Size, Align, /*AllowCommute=*/false);
+      if (NewMI)
+        return NewMI;
 
-        // Return here to prevent duplicate fuse failure report.
+      // Folding failed again - undo the commute before returning.
+      MachineInstr *UncommutedMI = commuteInstruction(MI, false,
+                                                      CommuteOpIdx1,
+                                                      CommuteOpIdx2);
+      if (!UncommutedMI) {
+        // Unable to commute.
+        return nullptr;
+      }
+      if (UncommutedMI != MI) {
+        // New instruction. It doesn't need to be kept.
+        UncommutedMI->eraseFromParent();
         return nullptr;
       }
+
+      // Return here to prevent duplicate fuse failure report.
+      return nullptr;
     }
   }
 
Index: llvm/test/CodeGen/X86/fma-commute-x86.ll
===================================================================
--- llvm/test/CodeGen/X86/fma-commute-x86.ll
+++ llvm/test/CodeGen/X86/fma-commute-x86.ll
@@ -0,0 +1,312 @@
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
+
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fmadd132ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fmadd231ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fmadd213ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fmadd132ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fmadd231ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fmadd213ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fmadd132pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fmadd231pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fmadd213pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fmadd132pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fmadd231pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fmadd213pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+
+
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fnmadd132ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fnmadd231ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fnmadd213ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fnmadd132ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fnmadd231ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fnmadd213ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fnmadd132pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fnmadd231pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fnmadd213pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fnmadd132pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fnmadd231pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fnmadd213pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fmsub132ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fmsub231ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fmsub213ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fmsub132ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fmsub231ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fmsub213ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fmsub132pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fmsub231pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fmsub213pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fmsub132pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fmsub231pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fmsub213pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fnmsub132ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fnmsub231ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) {
+  ; CHECK: fnmsub213ps {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fnmsub132ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fnmsub231ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) {
+  ; CHECK: fnmsub213ps {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
+  ret <8 x float> %res
+}
+
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fnmsub132pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fnmsub231pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) {
+  ; CHECK: fnmsub213pd {{.*%r.*}}, %xmm0, %xmm0
+  %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fnmsub132pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fnmsub231pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) {
+  ; CHECK: fnmsub213pd {{.*%r.*}}, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
+  ret <4 x double> %res
+}
+
Index: llvm/test/CodeGen/X86/fma_patterns.ll
===================================================================
--- llvm/test/CodeGen/X86/fma_patterns.ll
+++ llvm/test/CodeGen/X86/fma_patterns.ll
@@ -134,7 +134,7 @@
 }
 
 ; CHECK: test_x86_fnmadd_ss
-; CHECK: vfnmadd213ss    %xmm2, %xmm1, %xmm0
+; CHECK: vfnmadd213ss    %xmm2, %xmm0, %xmm1
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fnmadd_ss
 ; CHECK_FMA4: vfnmaddss    %xmm2, %xmm1, %xmm0, %xmm0
@@ -146,7 +146,7 @@
 }
 
 ; CHECK: test_x86_fnmadd_sd
-; CHECK: vfnmadd213sd     %xmm2, %xmm1, %xmm0
+; CHECK: vfnmadd213sd     %xmm2, %xmm0, %xmm1
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fnmadd_sd
 ; CHECK_FMA4: vfnmaddsd     %xmm2, %xmm1, %xmm0, %xmm0
@@ -158,7 +158,7 @@
 }
 
 ; CHECK: test_x86_fmsub_sd
-; CHECK: vfmsub213sd     %xmm2, %xmm1, %xmm0
+; CHECK: vfmsub213sd     %xmm2, %xmm0, %xmm1
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fmsub_sd
 ; CHECK_FMA4: vfmsubsd     %xmm2, %xmm1, %xmm0, %xmm0
@@ -170,7 +170,7 @@
 }
 
 ; CHECK: test_x86_fnmsub_ss
-; CHECK: vfnmsub213ss     %xmm2, %xmm1, %xmm0
+; CHECK: vfnmsub213ss     %xmm2, %xmm0, %xmm1
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fnmsub_ss
 ; CHECK_FMA4: vfnmsubss     %xmm2, %xmm1, %xmm0, %xmm0
@@ -183,8 +183,7 @@
 }
 
 ; CHECK: test_x86_fmadd_ps_load
-; CHECK: vmovaps         (%rdi), %xmm2
-; CHECK: vfmadd213ps     %xmm1, %xmm2, %xmm0
+; CHECK: vfmadd132ps	(%rdi), %xmm1, %xmm0
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fmadd_ps_load
 ; CHECK_FMA4: vfmaddps     %xmm1, (%rdi), %xmm0, %xmm0
@@ -197,8 +196,7 @@
 }
 
 ; CHECK: test_x86_fmsub_ps_load
-; CHECK: vmovaps         (%rdi), %xmm2
-; CHECK: fmsub213ps     %xmm1, %xmm2, %xmm0
+; CHECK: vfmsub132ps	(%rdi), %xmm1, %xmm0
 ; CHECK: ret
 ; CHECK_FMA4: test_x86_fmsub_ps_load
 ; CHECK_FMA4: vfmsubps     %xmm1, (%rdi), %xmm0, %xmm0