Index: llvm/lib/Target/X86/X86InstrFMA.td =================================================================== --- llvm/lib/Target/X86/X86InstrFMA.td +++ llvm/lib/Target/X86/X86InstrFMA.td @@ -164,15 +164,15 @@ // sence they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc. // instructions. // -// FIXME: The FMA*_Int instructions are TEMPORARILY defined as NOT commutable. +// All of the FMA*_Int opcodes are defined as commutable here. // Commuting the 2nd and 3rd source register operands of FMAs is quite trivial -// and the corresponding optimization has been developed (please see -// http://reviews.llvm.org/D13269 for details). The optimization though needs -// some minor tuning to enable it for FMA*_Int opcodes. +// and the corresponding optimizations have been developed. // Commuting the 1st operand of FMA*_Int requires some additional analysis, // the commute optimization is legal only if all users of FMA*_Int use only -// the lowest element of the FMA*_Int instruction. -let Constraints = "$src1 = $dst", isCommutable = 0, isCodeGenOnly =1, +// the lowest element of the FMA*_Int instruction. Even though such analysis +// may not be implemented yet, we allow the routines doing the actual commute +// transformation to decide if one or another instruction is commutable or not. +let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, hasSideEffects = 0 in multiclass fma3s_rm_int opc, string OpcodeStr, Operand memopr, RegisterClass RC> { Index: llvm/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.cpp +++ llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2975,110 +2975,137 @@ /// Otherwise, returns false. 
static bool isFMA3(unsigned Opcode) { switch (Opcode) { - case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: - case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: - case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: - case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: - case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: - case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: - case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: - case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: - - case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: - case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: - case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: - case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: - case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: - case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: - case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: - case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: - - case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: - case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: - case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: - case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: - case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: - case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: - case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: - case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: - - case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: - case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: - case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: - case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: - case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: - case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: - case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: - case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: - - case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: - case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: - case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: - case X86::VFMSUBPSr132r: case 
X86::VFMSUBPSr132m: - case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: - case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: - case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: - case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: - case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: - case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: - case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: - case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: - case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: - case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: - case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: - case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: - - case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: - case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: - case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: - case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: - case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: - case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: - case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: - case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: - - case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: - case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: - case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: - case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: - case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: - case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: - case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: - case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: - case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: - case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: - case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: - case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: - case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: - case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: - case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: - case X86::VFNMSUBPSr213rY: case 
X86::VFNMSUBPSr213mY: - - case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: - case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: - case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: - case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: - case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: - case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: - case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: - case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: - - case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: - case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: - case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: - case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: - case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: - case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: - case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: - case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: - case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: - case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: - case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: - case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: - case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: - case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: - case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: - case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: + case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: + case X86::VFMSUBSSr132r_Int: 
case X86::VFMSUBSSr132m_Int: + case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: + case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: + case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: + case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: + case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int: + case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: + case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: + case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: + case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: + case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: + case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: + case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: + case 
X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: + case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: + + case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case 
X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case 
X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: return true; default: return false; @@ -3404,7 +3431,7 @@ { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, - + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, @@ -3449,32 +3476,86 @@ { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } }; + + // Define the array that holds FMA*_Int opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. + static const unsigned IntrinOpcodeGroups[][3] = { + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, + { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, + { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, + + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, + { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, + { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, + + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, + { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, + { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, + + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, + 
{ X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, + { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, + { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, + }; + const unsigned Form132Index = 0; const unsigned Form213Index = 1; const unsigned Form231Index = 2; const unsigned FormsNum = 3; + const unsigned *FoundOpcodesGroup = nullptr; + unsigned FormIndex; + // Look for the input opcode in the OpcodeGroups table. - unsigned OpcodeGroupsNum = sizeof(OpcodeGroups) / sizeof(OpcodeGroups[0]); - unsigned GroupIndex = 0, FormIndex = FormsNum; - for (; GroupIndex < OpcodeGroupsNum && FormIndex == FormsNum; GroupIndex++) { + unsigned GroupsNum = sizeof(OpcodeGroups) / sizeof(OpcodeGroups[0]); + unsigned GroupIndex = 0; + for (; GroupIndex < GroupsNum && !FoundOpcodesGroup; GroupIndex++) { for (FormIndex = 0; FormIndex < FormsNum; FormIndex++) { - if (OpcodeGroups[GroupIndex][FormIndex] == Opc) + if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { + FoundOpcodesGroup = OpcodeGroups[GroupIndex]; break; + } + } + } + + // Look for the input opcode in the IntrinOpcodeGroups table. + bool IsIntrinOpcode = false; + if (FoundOpcodesGroup == nullptr) { + GroupsNum = sizeof(IntrinOpcodeGroups) / sizeof(IntrinOpcodeGroups[0]); + GroupIndex = 0; + for (; GroupIndex < GroupsNum && !FoundOpcodesGroup; GroupIndex++) { + for (FormIndex = 0; FormIndex < FormsNum; FormIndex++) { + if (IntrinOpcodeGroups[GroupIndex][FormIndex] == Opc) { + FoundOpcodesGroup = IntrinOpcodeGroups[GroupIndex]; + IsIntrinOpcode = true; + break; + } + } } } - // Input opcode does not match with any of the opcodes from the table. - if (FormIndex == FormsNum) + + // The input opcode does not match with any of the opcodes from the tables. + if (FoundOpcodesGroup == nullptr) return 0; - // Do not forget to fix the GroupIndex after the loop. - GroupIndex--; // Put the lowest index to SrcOpIdx1 to simplify the checks below. 
if (SrcOpIdx1 > SrcOpIdx2) std::swap(SrcOpIdx1, SrcOpIdx2); + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such analysis is + // not implemented yet. So, just return 0 in that case. + // When such analysis is available, this place will be the right place for + // calling it. + if (IsIntrinOpcode && SrcOpIdx1 == 1) + return 0; + unsigned Case; - if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) + if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) Case = 0; else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) Case = 1; @@ -3506,7 +3587,7 @@ // Everything is ready, just adjust the FMA opcode and return it. FormIndex = FormMapping[Case][FormIndex]; - return OpcodeGroups[GroupIndex][FormIndex]; + return FoundOpcodesGroup[FormIndex]; } bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, Index: llvm/test/CodeGen/X86/fma-commute-x86.ll =================================================================== --- llvm/test/CodeGen/X86/fma-commute-x86.ll +++ llvm/test/CodeGen/X86/fma-commute-x86.ll @@ -4,6 +4,38 @@ attributes #0 = { nounwind } +declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fmadd_baa_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vfmadd213ss %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fmadd_aba_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rcx), %xmm0 +; CHECK-NEXT: vfmadd132ss (%rdx), %xmm0, %xmm0 +; CHECK-NEXT: retq + 
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fmadd_bba_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdx), %xmm0 +; CHECK-NEXT: vfmadd213ss (%rcx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-LABEL: test_x86_fmadd_baa_ps: @@ -66,6 +98,38 @@ ret <8 x float> %res } +declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fmadd_baa_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fmadd_baa_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vfmadd213sd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmadd_aba_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fmadd_aba_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rcx), %xmm0 +; CHECK-NEXT: vfmadd132sd (%rdx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmadd_bba_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fmadd_bba_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdx), %xmm0 +; CHECK-NEXT: vfmadd213sd (%rcx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 
x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 { ; CHECK-LABEL: test_x86_fmadd_baa_pd: @@ -129,6 +193,37 @@ } +declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fnmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fnmadd_baa_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vfnmadd213ss %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fnmadd_aba_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rcx), %xmm0 +; CHECK-NEXT: vfnmadd132ss (%rdx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fnmadd_bba_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdx), %xmm0 +; CHECK-NEXT: vfnmadd213ss (%rcx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 { @@ -192,6 +287,38 @@ ret <8 x float> %res } +declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x 
double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fnmadd_baa_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fnmadd_baa_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vfnmadd213sd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmadd_aba_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fnmadd_aba_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rcx), %xmm0 +; CHECK-NEXT: vfnmadd132sd (%rdx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmadd_bba_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fnmadd_bba_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdx), %xmm0 +; CHECK-NEXT: vfnmadd213sd (%rcx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 { ; CHECK-LABEL: test_x86_fnmadd_baa_pd: @@ -255,6 +382,38 @@ } +declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fmsub_baa_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fmsub_baa_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vfmsub213ss %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = 
call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmsub_aba_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fmsub_aba_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rcx), %xmm0 +; CHECK-NEXT: vfmsub132ss (%rdx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmsub_bba_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fmsub_bba_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdx), %xmm0 +; CHECK-NEXT: vfmsub213ss (%rcx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-LABEL: test_x86_fmsub_baa_ps: @@ -317,6 +476,38 @@ ret <8 x float> %res } +declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fmsub_baa_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fmsub_baa_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vfmsub213sd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmsub_aba_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fmsub_aba_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rcx), %xmm0 +; CHECK-NEXT: vfmsub132sd (%rdx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> 
@llvm.x86.fma.vfmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmsub_bba_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fmsub_bba_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdx), %xmm0 +; CHECK-NEXT: vfmsub213sd (%rcx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 { ; CHECK-LABEL: test_x86_fmsub_baa_pd: @@ -380,6 +571,38 @@ } +declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fnmsub_baa_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fnmsub_baa_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vfnmsub213ss %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmsub_aba_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fnmsub_aba_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rcx), %xmm0 +; CHECK-NEXT: vfnmsub132ss (%rdx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmsub_bba_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: test_x86_fnmsub_bba_ss: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdx), %xmm0 +; CHECK-NEXT: vfnmsub213ss (%rcx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> 
@llvm.x86.fma.vfnmsub.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-LABEL: test_x86_fnmsub_baa_ps: @@ -442,6 +665,38 @@ ret <8 x float> %res } +declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fnmsub_baa_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fnmsub_baa_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}} +; CHECK-NEXT: vfnmsub213sd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmsub_aba_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fnmsub_aba_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rcx), %xmm0 +; CHECK-NEXT: vfnmsub132sd (%rdx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmsub_bba_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_x86_fnmsub_bba_sd: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdx), %xmm0 +; CHECK-NEXT: vfnmsub213sd (%rcx), %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 { ; CHECK-LABEL: test_x86_fnmsub_baa_pd: Index: 
llvm/test/CodeGen/X86/fma-intrinsics-x86.ll =================================================================== --- llvm/test/CodeGen/X86/fma-intrinsics-x86.ll +++ llvm/test/CodeGen/X86/fma-intrinsics-x86.ll @@ -10,9 +10,9 @@ ; CHECK-LABEL: test_x86_fma_vfmadd_ss: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdi)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 ; @@ -27,9 +27,9 @@ ; CHECK-LABEL: test_x86_fma_vfmadd_bac_ss: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rcx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 ; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 @@ -45,9 +45,9 @@ ; CHECK-LABEL: test_x86_fma_vfmadd_sd: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rdx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 ; @@ -62,9 +62,9 @@ ; CHECK-LABEL: test_x86_fma_vfmadd_bac_sd: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; 
CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rcx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 ; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 @@ -154,9 +154,9 @@ ; CHECK-LABEL: test_x86_fma_vfmsub_ss: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 ; @@ -171,9 +171,9 @@ ; CHECK-LABEL: test_x86_fma_vfmsub_bac_ss: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rcx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 ; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 @@ -190,9 +190,9 @@ ; CHECK-LABEL: test_x86_fma_vfmsub_sd: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rdx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 ; @@ -207,9 +207,9 @@ ; CHECK-LABEL: test_x86_fma_vfmsub_bac_sd: ; CHECK-NEXT: # BB#0: 
; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rcx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 ; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 @@ -299,9 +299,9 @@ ; CHECK-LABEL: test_x86_fma_vfnmadd_ss: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 ; @@ -316,9 +316,9 @@ ; CHECK-LABEL: test_x86_fma_vfnmadd_bac_ss: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rcx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 ; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 @@ -335,9 +335,9 @@ ; CHECK-LABEL: test_x86_fma_vfnmadd_sd: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rdx), 
%xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 ; @@ -352,9 +352,9 @@ ; CHECK-LABEL: test_x86_fma_vfnmadd_bac_sd: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rcx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 ; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 @@ -444,9 +444,9 @@ ; CHECK-LABEL: test_x86_fma_vfnmsub_ss: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 ; @@ -461,9 +461,9 @@ ; CHECK-LABEL: test_x86_fma_vfnmsub_bac_ss: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rcx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 ; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 @@ -480,9 +480,9 @@ ; CHECK-LABEL: test_x86_fma_vfnmsub_sd: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: 
vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rcx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rdx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 ; @@ -497,9 +497,9 @@ ; CHECK-LABEL: test_x86_fma_vfnmsub_bac_sd: ; CHECK-NEXT: # BB#0: ; -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}} -; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vmovap{{s|d}} {{\(%rdx\), %xmm0|\(%r8\), %xmm1}} +; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rcx), %xmm1, %xmm0 ; ; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 ; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0