Index: llvm/lib/Target/X86/X86InstrFMA.td =================================================================== --- llvm/lib/Target/X86/X86InstrFMA.td +++ llvm/lib/Target/X86/X86InstrFMA.td @@ -60,27 +60,47 @@ string OpcodeStr, string PackTy, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - // For 213, both the register and memory variant are commutable. - // Indeed, the commutable operands are 1 and 2 and both live in registers - // for both variants. +let hasSideEffects = 0 in { + // For 213, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); + // operands 1 and 3 (register forms only): *213* --> *231*; + // operands 2 and 3 (register forms only): *213* --> *132*. defm r213 : fma3p_rm; -let hasSideEffects = 0 in { + // For 132, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *132* --> *231*; + // operands 1 and 3 (register forms only): *132* --> *132*(no changes); + // operands 2 and 3 (register forms only): *132* --> *213*. defm r132 : fma3p_rm; - // For 231, only the register variant is commutable. + MemFrag128, MemFrag256, OpTy128, OpTy256, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 1>; + // For 231, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. 
// For the memory variant the folded operand must be in 3. Thus, - // in that case, it cannot be swapped with 2. + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *231* --> *132*; + // operands 1 and 3 (register forms only): *231* --> *213*; + // operands 2 and 3 (register forms only): *231* --> *231*(no changes). defm r231 : fma3p_rm; + /* IsMVariantCommutable */ 1>; } // hasSideEffects = 0 } @@ -156,23 +176,54 @@ X86MemOperand x86memop, Operand memop, PatFrag mem_frag, ComplexPattern mem_cpat> { let hasSideEffects = 0 in { + // For 132, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *132* --> *231*; + // operands 1 and 3 (register forms only): *132* --> *132*(no changes); + // operands 2 and 3 (register forms only): *132* --> *213*. + // Commuting the operand 1 with some other operand changes the upper bits + // of the result FMA instruction. Thus, it requires a proof of the fact that + // only the lowest element of the result is used. defm r132 : fma3s_rm; - // See the other defm of r231 for the explanation regarding the - // commutable flags. + x86memop, RC, OpVT, mem_frag, + /* IsRVariantCommutable */ 1, + /* IsMVariantCommutable */ 1>; + // For 231, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. 
+ // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *231* --> *132*; + // operands 1 and 3 (register forms only): *231* --> *213*; + // operands 2 and 3 (register forms only): *231* --> *231*(no changes). + // Commuting the operand 1 with some other operand changes the upper bits + // of the result FMA instruction. Thus, it requires a proof of the fact that + // only the lowest element of the result is used. defm r231 : fma3s_rm; -} + /* IsMVariantCommutable */ 1>; -// See the other defm of r213 for the explanation regarding the -// commutable flags. -defm r213 : fma3s_rm; + // For 213, both the register and memory variants are commutable. + // For the register form the commutable operands are 1, 2 and 3. + // For the memory variant the folded operand must be in 3. Thus, + // in that case, only the operands 1 and 2 can be swapped. + // Commuting some of operands may require the opcode change: + // operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); + // operands 1 and 3 (register forms only): *213* --> *231*; + // operands 2 and 3 (register forms only): *213* --> *132*. + // Commuting the operand 1 with some other operand changes the upper bits + // of the result FMA instruction. Thus, it requires a proof of the fact that + // only the lowest element of the result is used. + defm r213 : fma3s_rm; +} } multiclass fma3s opc132, bits<8> opc213, bits<8> opc231, Index: llvm/lib/Target/X86/X86InstrInfo.h =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.h +++ llvm/lib/Target/X86/X86InstrInfo.h @@ -264,6 +264,46 @@ bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; + /// Returns true if the routine could find two commutable operands + /// in the given FMA instruction. Otherwise, returns false. + /// + /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. 
+ /// The output indices of the commuted operands are returned in these + /// arguments. Also, the input values of these arguments may be preset either + /// to indices of operands that must be commuted or be equal to a special + /// value 'CommuteAnyOperandIndex' which means that the corresponding + /// operand index is not set and this method is free to pick any of + /// the available commutable operands. + /// + /// For example, calling this method this way: + /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex; + /// findFMA3CommutedOpIndices(MI, Idx1, Idx2); + /// can be interpreted as a query asking if the operand #1 can be swapped + /// with any other available operand (e.g. operand #2, operand #3, etc.). + /// + /// Commuting the found operands may require changing the FMA opcode. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results in an instruction with an adjusted opcode: + /// FMA231 #3, #2, #1 + bool findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const; + + /// Returns an adjusted FMA opcode that must be used in the FMA instruction + /// that performs the same computations as the given MI but which has the + /// operands SrcOpIdx1 and SrcOpIdx2 commuted. + /// Returns 0 if the operands cannot be commuted (e.g. it is unsafe to do so). + /// + /// The returned FMA opcode may differ from the opcode in the given MI. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results in an instruction with an adjusted opcode: + /// FMA231 #3, #2, #1 + unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const; + // Branch analysis. 
bool isUnpredicatedTerminator(const MachineInstr* MI) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, Index: llvm/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.cpp +++ llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2917,6 +2917,121 @@ return NewMI; } +/// Returns true if the given instruction opcode is FMA3. +/// Otherwise, returns false. +static bool isFMA3(unsigned Opcode) { + switch (Opcode) { + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case 
X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case 
X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + return true; + default: + break; + } + return false; +} + MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, bool NewMI, 
unsigned OpIdx1, @@ -3127,10 +3242,218 @@ // Fallthrough intended. } default: + if (isFMA3(MI->getOpcode())) { + unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + if (Opc == 0) + return nullptr; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + } return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } } +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + + unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + + // Only the first RegOpsNum operands are commutable. + // Also, the value 'CommuteAnyOperandIndex' is valid here as it means + // that the operand is not specified/fixed. + if (SrcOpIdx1 < 1 || + (SrcOpIdx1 > RegOpsNum && SrcOpIdx1 != CommuteAnyOperandIndex) || + SrcOpIdx2 < 1 || + (SrcOpIdx2 > RegOpsNum && SrcOpIdx2 != CommuteAnyOperandIndex)) + return false; + + if (SrcOpIdx1 == CommuteAnyOperandIndex || + SrcOpIdx2 == CommuteAnyOperandIndex) { + unsigned CommutableOpIdx1 = SrcOpIdx1; + unsigned CommutableOpIdx2 = SrcOpIdx2; + + // At least one of the operands to be commuted is not specified and + // this method is free to choose appropriate commutable operands. + if (SrcOpIdx1 == SrcOpIdx2) + // Neither operand is fixed. By default set one of the commutable + // operands to the last register operand (index RegOpsNum). + CommutableOpIdx2 = RegOpsNum; + else if (SrcOpIdx2 == CommuteAnyOperandIndex) + // Only SrcOpIdx2 is not fixed; search for a partner of SrcOpIdx1. + CommutableOpIdx2 = SrcOpIdx1; + + // CommutableOpIdx2 is well defined now. Let's choose another commutable + // operand and assign its index to CommutableOpIdx1. + unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + // The commuted operands must have different registers. 
+ // Otherwise, the commute transformation does not change anything and + // is useless. + if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + break; + } + + // No appropriate commutable operands were found. + if (CommutableOpIdx1 == 0) + return false; + + // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2 + // to return those values. + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + } + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; +} + +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + int RetOpc = 0; + int Opc = MI->getOpcode(); + + // Table of FMA opcodes: each row holds the 132/213/231 forms of one op. + static const struct { + int Opc1; + int Opc2; + int Opc3; + bool IsScalar; + } OpcodeAlts[] = { + { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r, true }, + { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r, true }, + { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r, false }, + { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r, false }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY,false }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY,false }, + { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m, true }, + { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m, true }, + { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m, false }, + { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m, false }, + { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY,false }, + { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY,false }, + + { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r, true }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r, true }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r, false }, + { 
X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r, false }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY,false }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY,false }, + { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m, true }, + { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m, true }, + { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m, false }, + { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m, false }, + { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY,false }, + { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY,false }, + + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r, true }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r, true }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r, false }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r, false }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY,false }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY,false }, + { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m, true }, + { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m, true }, + { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m, false }, + { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m, false }, + { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY,false }, + { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY,false }, + + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r, true }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r, true }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r, false }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r, false }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY,false }, + { X86::VFNMSUBPDr132rY, 
X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY,false }, + { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m, true }, + { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m, true }, + { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m, false }, + { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m, false }, + { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY,false }, + { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY,false }, + + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r, false }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r, false }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr231rY,false }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY,false }, + { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m, false }, + { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m, false }, + { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY,false }, + { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY,false }, + + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r, false }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r, false }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY,false }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY,false }, + { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m, false }, + { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m, false }, + { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY,false }, + { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY,false } + }; + + unsigned char OpcodeAltsNum = sizeof(OpcodeAlts) / sizeof(OpcodeAlts[0]); + int i, pos = 0; + for (i = 0; i < OpcodeAltsNum; i++) { + if 
(OpcodeAlts[i].Opc2 == Opc) { + pos = 2; + break; + } + if (OpcodeAlts[i].Opc1 == Opc) { + pos = 1; + break; + } + if (OpcodeAlts[i].Opc3 == Opc) { + pos = 3; + break; + } + } + + // Input opcode does not match any opcode from the table. + if (pos == 0) + return 0; + + // FIXME: Commuting the 1st operand of a scalar FMA requires additional + // analysis, such as a proof that all uses of the given FMA + // instruction read only the lowest element. Without such a proof, + // commuting the 1st operand of a scalar FMA would change the upper + // bits of the result. + if (OpcodeAlts[i].IsScalar && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) + return 0; + + // Pick the opcode form (132/213/231) matching the commuted operand order. + if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 2) || + (SrcOpIdx1 == 2 && SrcOpIdx2 == 1)) { + if (pos == 1) + RetOpc = OpcodeAlts[i].Opc3; + else if (pos == 2) + RetOpc = Opc; + else + RetOpc = OpcodeAlts[i].Opc1; + } else if ((SrcOpIdx1 == 1 && SrcOpIdx2 == 3) || + (SrcOpIdx1 == 3 && SrcOpIdx2 == 1)) { + if (pos == 1) + RetOpc = Opc; + else if (pos == 2) + RetOpc = OpcodeAlts[i].Opc3; + else + RetOpc = OpcodeAlts[i].Opc2; + } else if ((SrcOpIdx1 == 2 && SrcOpIdx2 == 3) || + (SrcOpIdx1 == 3 && SrcOpIdx2 == 2)) { + if (pos == 1) + RetOpc = OpcodeAlts[i].Opc2; + else if (pos == 2) + RetOpc = OpcodeAlts[i].Opc1; + else + RetOpc = Opc; + } + + return RetOpc; +} + bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { @@ -3155,34 +3478,9 @@ } return false; } - case X86::VFMADDPDr231r: - case X86::VFMADDPSr231r: - case X86::VFMADDSDr231r: - case X86::VFMADDSSr231r: - case X86::VFMSUBPDr231r: - case X86::VFMSUBPSr231r: - case X86::VFMSUBSDr231r: - case X86::VFMSUBSSr231r: - case X86::VFNMADDPDr231r: - case X86::VFNMADDPSr231r: - case X86::VFNMADDSDr231r: - case X86::VFNMADDSSr231r: - case X86::VFNMSUBPDr231r: - case X86::VFNMSUBPSr231r: - case X86::VFNMSUBSDr231r: - case X86::VFNMSUBSSr231r: - case X86::VFMADDPDr231rY: - case 
X86::VFMADDPSr231rY: - case X86::VFMSUBPDr231rY: - case X86::VFMSUBPSr231rY: - case X86::VFNMADDPDr231rY: - case X86::VFNMADDPSr231rY: - case X86::VFNMSUBPDr231rY: - case X86::VFNMSUBPSr231rY: - // The indices of the commutable operands are 2 and 3. - // Assign them to the returned operand indices here. - return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); default: + if (isFMA3(MI->getOpcode())) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } return false; Index: llvm/test/CodeGen/X86/fma-commute-x86.ll =================================================================== --- llvm/test/CodeGen/X86/fma-commute-x86.ll +++ llvm/test/CodeGen/X86/fma-commute-x86.ll @@ -0,0 +1,312 @@ +; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s +; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s + +declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fmadd132ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fmadd231ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fmadd213ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone 
+define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fmadd132ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fmadd231ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fmadd213ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fmadd132pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fmadd231pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fmadd213pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone +define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fmadd132pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x 
double> %b, <4 x double> %a, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fmadd231pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fmadd213pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + + + +declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fnmadd132ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fnmadd231ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fnmadd213ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone +define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fnmadd132ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fnmadd231ps 
{{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fnmadd213ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fnmadd132pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fnmadd231pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fnmadd213pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone +define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fnmadd132pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fnmadd231pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> 
%res +} + +define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fnmadd213pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + + +declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone +define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fmsub132ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fmsub231ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fmsub213ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone +define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fmsub132ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fmsub231ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fmsub213ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x 
float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fmsub132pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fmsub231pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fmsub213pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone +define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fmsub132pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fmsub231pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fmsub213pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + + +declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone 
+define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fnmsub132ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fnmsub231ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) { +; CHECK: fnmsub213ps {{.*%r.*}}, %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone +define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fnmsub132ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fnmsub231ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) { +; CHECK: fnmsub213ps {{.*%r.*}}, %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind + ret <8 x float> %res +} + +declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fnmsub132pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x 
double> %a, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fnmsub231pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) { +; CHECK: fnmsub213pd {{.*%r.*}}, %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone +define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fnmsub132pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fnmsub231pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + +define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) { +; CHECK: fnmsub213pd {{.*%r.*}}, %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind + ret <4 x double> %res +} + Index: llvm/test/CodeGen/X86/fma_patterns.ll =================================================================== --- llvm/test/CodeGen/X86/fma_patterns.ll +++ llvm/test/CodeGen/X86/fma_patterns.ll @@ -176,7 +176,8 @@ define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) { ; CHECK-LABEL: test_x86_fnmadd_ss: ; CHECK: # BB#0: -; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps 
%xmm1, %xmm0 ; CHECK-NEXT: retq ; ; CHECK_FMA4-LABEL: test_x86_fnmadd_ss: @@ -191,7 +192,8 @@ define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) { ; CHECK-LABEL: test_x86_fnmadd_sd: ; CHECK: # BB#0: -; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq ; ; CHECK_FMA4-LABEL: test_x86_fnmadd_sd: @@ -206,7 +208,8 @@ define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) { ; CHECK-LABEL: test_x86_fmsub_sd: ; CHECK: # BB#0: -; CHECK-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq ; ; CHECK_FMA4-LABEL: test_x86_fmsub_sd: @@ -221,7 +224,8 @@ define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) { ; CHECK-LABEL: test_x86_fnmsub_ss: ; CHECK: # BB#0: -; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq ; ; CHECK_FMA4-LABEL: test_x86_fnmsub_ss: @@ -237,8 +241,7 @@ define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) { ; CHECK-LABEL: test_x86_fmadd_ps_load: ; CHECK: # BB#0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq ; ; CHECK_FMA4-LABEL: test_x86_fmadd_ps_load: @@ -254,8 +257,7 @@ define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) { ; CHECK-LABEL: test_x86_fmsub_ps_load: ; CHECK: # BB#0: -; CHECK-NEXT: vmovaps (%rdi), %xmm2 -; CHECK-NEXT: vfmsub213ps %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0 ; CHECK-NEXT: retq ; ; CHECK_FMA4-LABEL: test_x86_fmsub_ps_load: @@ -460,7 +462,8 @@ ; CHECK-LABEL: test_f32_interp: ; CHECK: # BB#0: ; CHECK-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vfmadd213ss %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: 
vfmadd213ss %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq ; ; CHECK_FMA4-LABEL: test_f32_interp: @@ -517,7 +520,8 @@ ; CHECK-LABEL: test_f64_interp: ; CHECK: # BB#0: ; CHECK-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1 -; CHECK-NEXT: vfmadd213sd %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vfmadd213sd %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq ; ; CHECK_FMA4-LABEL: test_f64_interp: