This is an archive of the discontinued LLVM Phabricator instance.

[X86][FMA] Switch FMA3 form to most suitable for each given case.
AbandonedPublic

Authored by zinovy.nis on Jul 22 2014, 11:23 AM.

Download Raw Diff

Details

Reviewers

spatel
nadav
hfinkel
lhames

Summary

This patch extends the existing FMA form switching algorithm (X86TargetLowering::emitFMA3Instr) with 3 more heuristics for rearranging operands:

operand defined via instruction with canFoldAsLoad()==true moves to 3rd place to help memory folding and to save a phys register.
if FMA result is written into phys register and one of operands is defined by copying from the same phys register, then make this operand 1st to eliminate excessive COPY.
prefer to make a <kill> operand 1st, as it can help to re-use phys. registers.

This patch fixes cases from http://llvm.org/bugs/show_bug.cgi?id=17229

The issue from http://llvm.org/bugs/show_bug.cgi?id=20043 is also fixed by this patch, but TwoAddressInstructionPass later decides
to commute the operands I arranged and re-creates the excessive COPY. That needs to be investigated and fixed separately.

I haven't seen a large performance gain from this patch on the internal benchmarks we use (Elena D. in http://permalink.gmane.org/gmane.comp.compilers.llvm.devel/69035 was right :-) ), but lots of MOVAPS instructions disappeared and the number of stack operations also decreased. I also have not seen noticeable regressions on the benchmarks I ran.

Diff Detail

Event Timeline

zinovy.nis updated this revision to Diff 11768.Jul 22 2014, 11:23 AM

zinovy.nis retitled this revision from to [X86][FMA] Switch FMA3 form to most suitable for each given case..

zinovy.nis updated this object.

zinovy.nis edited the test plan for this revision. (Show Details)

zinovy.nis added reviewers: nadav, hfinkel, spatel.

zinovy.nis set the repository for this revision to rL LLVM.

zinovy.nis added a reviewer: lhames.Jul 22 2014, 11:25 AM

zinovy.nis added a subscriber: Unknown Object (MLST).Jul 22 2014, 1:16 PM

Abandon? FMA commutation/folding was added by v_klochkov

zinovy.nis abandoned this revision.Nov 27 2015, 2:46 AM

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

334 lines

X86InstrFMA.td

8 lines

test/

CodeGen/

X86/

fma_patterns.ll

6 lines

Diff 11768

lib/Target/X86/X86ISelLowering.cpp

Property	Old Value	New Value
File Mode	100644	100755

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 46 Lines • ▼ Show 20 Lines
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"		#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"		#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"		#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"		#include "llvm/Target/TargetOptions.h"
#include <bitset>		#include <bitset>
#include <numeric>		#include <numeric>
#include <cctype>		#include <cctype>
		#include <array>
using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "x86-isel"		#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");		STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(		static cl::opt<bool> ExperimentalVectorWideningLegalization(
"x86-experimental-vector-widening-legalization", cl::init(false),		"x86-experimental-vector-widening-legalization", cl::init(false),
▲ Show 20 Lines • Show All 17,867 Lines • ▼ Show 20 Lines	X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
MIB.setMemRefs(MMOBegin, MMOEnd);		MIB.setMemRefs(MMOBegin, MMOEnd);
// Jump		// Jump
BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);		BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

MI->eraseFromParent();		MI->eraseFromParent();
return MBB;		return MBB;
}		}

// Replace 213-type (isel default) FMA3 instructions with 231-type for		/// \brief Switches FMA213xxxx into FMA231xxxx form.
		static unsigned switchFMA213To231(unsigned OldFMAOpc) {
		  // Each register-register 213 opcode listed here maps to the opcode with the
		  // same mnemonic and suffix but the 231 form. Anything not listed is treated
		  // as a programming error by the default case.
		  unsigned NewFMAOpc = 0;
		  switch (OldFMAOpc) {
		  // Opcodes without the "Y" suffix.
		  case X86::VFMADDPDr213r:   NewFMAOpc = X86::VFMADDPDr231r;   break;
		  case X86::VFMADDPSr213r:   NewFMAOpc = X86::VFMADDPSr231r;   break;
		  case X86::VFMADDSDr213r:   NewFMAOpc = X86::VFMADDSDr231r;   break;
		  case X86::VFMADDSSr213r:   NewFMAOpc = X86::VFMADDSSr231r;   break;
		  case X86::VFMSUBPDr213r:   NewFMAOpc = X86::VFMSUBPDr231r;   break;
		  case X86::VFMSUBPSr213r:   NewFMAOpc = X86::VFMSUBPSr231r;   break;
		  case X86::VFMSUBSDr213r:   NewFMAOpc = X86::VFMSUBSDr231r;   break;
		  case X86::VFMSUBSSr213r:   NewFMAOpc = X86::VFMSUBSSr231r;   break;
		  case X86::VFNMADDPDr213r:  NewFMAOpc = X86::VFNMADDPDr231r;  break;
		  case X86::VFNMADDPSr213r:  NewFMAOpc = X86::VFNMADDPSr231r;  break;
		  case X86::VFNMADDSDr213r:  NewFMAOpc = X86::VFNMADDSDr231r;  break;
		  case X86::VFNMADDSSr213r:  NewFMAOpc = X86::VFNMADDSSr231r;  break;
		  case X86::VFNMSUBPDr213r:  NewFMAOpc = X86::VFNMSUBPDr231r;  break;
		  case X86::VFNMSUBPSr213r:  NewFMAOpc = X86::VFNMSUBPSr231r;  break;
		  case X86::VFNMSUBSDr213r:  NewFMAOpc = X86::VFNMSUBSDr231r;  break;
		  case X86::VFNMSUBSSr213r:  NewFMAOpc = X86::VFNMSUBSSr231r;  break;
		  // "Y"-suffixed packed opcodes.
		  case X86::VFMADDPDr213rY:  NewFMAOpc = X86::VFMADDPDr231rY;  break;
		  case X86::VFMADDPSr213rY:  NewFMAOpc = X86::VFMADDPSr231rY;  break;
		  case X86::VFMSUBPDr213rY:  NewFMAOpc = X86::VFMSUBPDr231rY;  break;
		  case X86::VFMSUBPSr213rY:  NewFMAOpc = X86::VFMSUBPSr231rY;  break;
		  case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
		  case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
		  case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
		  case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
		  default: llvm_unreachable("Unrecognized FMA variant.");
		  }
		  return NewFMAOpc;
		}

		/// \brief Maps a register-register FMA3 opcode in 213 form to the opcode
		/// with the same mnemonic and suffix in 132 form.
		///
		/// \param OldFMAOpc one of the X86::V*r213r / X86::V*r213rY opcodes listed
		/// in the switch below.
		/// \returns the matching X86::V*r132r / X86::V*r132rY opcode. Any other
		/// input reaches llvm_unreachable.
		static unsigned switchFMA213To132(unsigned OldFMAOpc) {
		  unsigned NewFMAOpc = 0;
		  switch (OldFMAOpc) {
		  // Opcodes without the "Y" suffix.
		  case X86::VFMADDPDr213r:   NewFMAOpc = X86::VFMADDPDr132r;   break;
		  case X86::VFMADDPSr213r:   NewFMAOpc = X86::VFMADDPSr132r;   break;
		  case X86::VFMADDSDr213r:   NewFMAOpc = X86::VFMADDSDr132r;   break;
		  case X86::VFMADDSSr213r:   NewFMAOpc = X86::VFMADDSSr132r;   break;
		  case X86::VFMSUBPDr213r:   NewFMAOpc = X86::VFMSUBPDr132r;   break;
		  case X86::VFMSUBPSr213r:   NewFMAOpc = X86::VFMSUBPSr132r;   break;
		  case X86::VFMSUBSDr213r:   NewFMAOpc = X86::VFMSUBSDr132r;   break;
		  case X86::VFMSUBSSr213r:   NewFMAOpc = X86::VFMSUBSSr132r;   break;
		  case X86::VFNMADDPDr213r:  NewFMAOpc = X86::VFNMADDPDr132r;  break;
		  case X86::VFNMADDPSr213r:  NewFMAOpc = X86::VFNMADDPSr132r;  break;
		  case X86::VFNMADDSDr213r:  NewFMAOpc = X86::VFNMADDSDr132r;  break;
		  case X86::VFNMADDSSr213r:  NewFMAOpc = X86::VFNMADDSSr132r;  break;
		  case X86::VFNMSUBPDr213r:  NewFMAOpc = X86::VFNMSUBPDr132r;  break;
		  case X86::VFNMSUBPSr213r:  NewFMAOpc = X86::VFNMSUBPSr132r;  break;
		  case X86::VFNMSUBSDr213r:  NewFMAOpc = X86::VFNMSUBSDr132r;  break;
		  case X86::VFNMSUBSSr213r:  NewFMAOpc = X86::VFNMSUBSSr132r;  break;
		  // "Y"-suffixed packed opcodes.
		  case X86::VFMADDPDr213rY:  NewFMAOpc = X86::VFMADDPDr132rY;  break;
		  case X86::VFMADDPSr213rY:  NewFMAOpc = X86::VFMADDPSr132rY;  break;
		  case X86::VFMSUBPDr213rY:  NewFMAOpc = X86::VFMSUBPDr132rY;  break;
		  case X86::VFMSUBPSr213rY:  NewFMAOpc = X86::VFMSUBPSr132rY;  break;
		  case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr132rY; break;
		  case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr132rY; break;
		  case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr132rY; break;
		  case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr132rY; break;
		  default: llvm_unreachable("Unrecognized FMA variant.");
		  }
		  return NewFMAOpc;
		}

		/// \brief Returns true if the register operand \p mio is defined by a full
		/// copy whose source is the register \p physReg.
		///
		/// Only the first instruction reported by MRI.def_instr_begin() for the
		/// operand's register is inspected. If the register has no defining
		/// instruction at all, the function returns false rather than dereferencing
		/// the end iterator.
		///
		/// \param MRI     register info used to look up the defining instruction.
		/// \param mio     operand to inspect; must be a register operand.
		/// \param physReg register the copy is expected to read from.
		static bool isOperandFullCopyFromReg(MachineRegisterInfo &MRI,
		                                     MachineOperand &mio, unsigned physReg) {
		  assert(mio.isReg() && "Expected a register operand.");
		  auto DefIt = MRI.def_instr_begin(mio.getReg());
		  // A register without any definition cannot be a copy of anything.
		  if (DefIt == MRI.def_instr_end())
		    return false;
		  const auto &Def = *DefIt;
		  // Operand 1 is only read once isFullCopy() has confirmed the instruction is
		  // a copy, so it is the copy's source register.
		  return Def.isFullCopy() && Def.getOperand(1).getReg() == physReg;
		}

		/// \brief Picks the FMA3 form number for a permutation of FMA213 operands.
		///
		/// \param Operands permutation such as [0,2,3,1]. Entries 1..3 are read as
		/// decimal digits naming which original operand ends up in each position.
		/// \returns 132, 213 or 231: the form for which the permuted instruction
		/// produces %0 = %1 * %2 + %3 in terms of the original operands. Any
		/// digit combination other than the six permutations of {1,2,3} reaches
		/// llvm_unreachable.
		static unsigned canonicalizeFMA(const std::array<char, 4> &Operands) {
		  // Pack the three source indices into one decimal key, e.g. {_,2,3,1} -> 231.
		  const unsigned Key = Operands[1] * 100 + Operands[2] * 10 + Operands[3];
		  unsigned Form = 0;
		  switch (Key) {
		  // The two multiplicands may be swapped freely, so each form covers two keys.
		  case 132: case 231: Form = 132; break;
		  case 123: case 213: Form = 213; break;
		  case 312: case 321: Form = 231; break;
		  default: llvm_unreachable("Unexpected FMA form");
		  }
		  return Form;
		}

		// Try to optimize FMA3 in 4 ways:
		// 1) Replace 213-type (isel default) FMA3 instructions with 231-type for
// accumulator loops. Writing back to the accumulator allows the coalescer		// accumulator loops. Writing back to the accumulator allows the coalescer
// to remove extra copies in the loop.		// to remove extra copies in the loop.
MachineBasicBlock *		// 2) If 1st operand (tied with dest) is used only for copying to physical reg,
X86TargetLowering::emitFMA3Instr(MachineInstr *MI,		// then look for operand which is defined by copying from this phys.
MachineBasicBlock *MBB) const {		// register.
MachineOperand &AddendOp = MI->getOperand(3);		// If such operand found, make it a 1st to eliminate excessive MOVs.
		// 3) Try to place killed operands to the 1st place. It can save phys register
		// and avoid copying.
		// 4) Make memory loading operand a 3rd to help folding.
		//
		// First we permute operands and then find the most suitable FMA form for it.
		MachineBasicBlock * X86TargetLowering::emitFMA3Instr(MachineInstr *MI, MachineBasicBlock *MBB) const {

// Bail out early if the addend isn't a register - we can't switch these.		// Sanity checks.
if (!AddendOp.isReg())		assert(MI->getNumOperands() == 4 && "FMA3 must have 4 operands.");
		// Expect registers only: no mem, no immediates.
		for (unsigned i = 0; i < MI->getNumOperands(); ++i)
		if (!MI->getOperand(i).isReg())
return MBB;		return MBB;

		// Initial operands permutation corresponding to FMA213.
		const std::array<char, 4> OperandsId = {0, 1, 2, 3};
		// Optimized operands permutation.
		std::array<char, 4> Operands = OperandsId;
		// 1st operand index was optimized earlier and must not be changed.
		bool FirstFixed = false;

MachineFunction &MF = *MBB->getParent();		MachineFunction &MF = *MBB->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();		MachineRegisterInfo &MRI = MF.getRegInfo();

// Check whether the addend is defined by a PHI:		// 1) Look for the following pattern:
assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
if (!AddendDef.isPHI())
return MBB;

// Look for the following pattern:
// loop:		// loop:
// %addend = phi [%entry, 0], [%loop, %result]		// %addend = phi [%entry, 0], [%loop, %result]
// ...		// ...
// %result<tied1> = FMA213 %m2<tied0>, %m1, %addend		// %result<tied1> = FMA213 %m2<tied0>, %m1, %addend

// Replace with:		// Replace with:
// loop:		// loop:
// %addend = phi [%entry, 0], [%loop, %result]		// %addend = phi [%entry, 0], [%loop, %result]
// ...		// ...
// %result<tied1> = FMA231 %addend<tied0>, %m1, %m2		// %result<tied1> = FMA231 %addend<tied0>, %m1, %m2

		if (!FirstFixed) {
		const unsigned AddendIndex = 3;
		MachineOperand &AddendOp = MI->getOperand(Operands[AddendIndex]);
		// Check whether the addend is defined by a PHI:
		assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
		MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
		if (AddendDef.isPHI()) {
for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {		for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
assert(AddendDef.getOperand(i).isReg());
MachineOperand PHISrcOp = AddendDef.getOperand(i);		MachineOperand PHISrcOp = AddendDef.getOperand(i);
MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());		MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
if (&PHISrcInst == MI) {		if (&PHISrcInst == MI) {
// Found a matching instruction.		std::swap(Operands[1], Operands[AddendIndex]);
unsigned NewFMAOpc = 0;		// Fix the first operand, so it cannot be replaced later.
switch (MI->getOpcode()) {		FirstFixed = true;
case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;		break;
case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;		}
case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;		}
case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;		}
case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
default: llvm_unreachable("Unrecognized FMA variant.");
}		}

		// Try to change destination registers only if this FMA has exactly 1 usage.
		if (MRI.hasOneUse(MI->getOperand(Operands[0]).getReg())) {
		// 2) Look for pattern
		//
		// %a = ...
		// %b = ...
		// %c = ...
		// %res = FMAxxx %a, %b, %c
		// %PHYS_REG = COPY %res<kill>
		//
		// and replace it with FMA form where first operand is copied
		// directly from %PHYS_REG, if possible. It eliminates COPY after FMA.
		//
		if (!FirstFixed) {
		do {
		// a) Check that FMA result is copied into phys reg.
		auto UseOfFMADefIt =
		MRI.use_instr_begin(MI->getOperand(Operands[0]).getReg());
		// If this FMA is useless, do nothing: it will be removed later by DCE.
		if (UseOfFMADefIt == MRI.use_instr_end())
		break;
		MachineInstr &UseOfFMADef = *UseOfFMADefIt;
		const unsigned FMADefReg = UseOfFMADef.getOperand(0).getReg();
		if (!UseOfFMADef.isFullCopy() ||
		!TargetRegisterInfo::isPhysicalRegister(FMADefReg))
		break;

		// b) Find the operand which is defined by copying from the same phys
		// register to which FMA writes.
		// Also check the first operand: if it matches, then the existing FMA
		// schema is optimal.
		for (unsigned i = 1; i <= 3; ++i)
		if (isOperandFullCopyFromReg(MRI, MI->getOperand(Operands[i]),
		FMADefReg)) {
		// Fix 1st operand.
		FirstFixed = true;
		std::swap(Operands[1], Operands[i]);
		break;
		}
		} while (false);
		}

		// 3) Find first killed operand and make it first, if possible, to reuse its
		// physical register.
		if (!FirstFixed) {
		do {
		for (unsigned i = 1; i <= 3; ++i) {
		MachineOperand &Operand = MI->getOperand(Operands[i]);
		if (Operand.isKill()) {
		FirstFixed = true;
		std::swap(Operands[1], Operands[i]);
		break;
		}
		}
		} while (false);
		}
		}

		// 4) Try to make memory accessing operand the 3rd to help memory folding.
		// We can't move 1st operand if it's already fixed on prev. step.
		for (unsigned i = 3; i > (FirstFixed ? 1 : 0); --i) {
		MachineOperand &Operand = MI->getOperand(Operands[i]);
		if (MRI.def_instr_begin(Operand.getReg())->canFoldAsLoad()) {
		// Operand may load memory, so move it to 3d place to have a chance of
		// folding.
		std::swap(Operands[3], Operands[i]);
		break;
		}
		}

		// Operands order changed? We need to regenerate the instruction.
		if (Operands != OperandsId) {
		const unsigned NewFMAOp = canonicalizeFMA(Operands);
		assert(NewFMAOp == 132 || NewFMAOp == 213 || NewFMAOp == 231);
		const unsigned CurrFMAOpc = MI->getOpcode();
		unsigned NewFMAOpc;
		if (NewFMAOp == 132)
		NewFMAOpc = switchFMA213To132(CurrFMAOpc);
		else if (NewFMAOp == 231)
		NewFMAOpc = switchFMA213To231(CurrFMAOpc);
		else if (NewFMAOp == 213)
		NewFMAOpc = CurrFMAOpc;
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();		const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
MachineInstrBuilder MIB =		MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
.addOperand(MI->getOperand(0))		.addOperand(MI->getOperand(0))
.addOperand(MI->getOperand(3))		.addOperand(MI->getOperand(Operands[1]))
.addOperand(MI->getOperand(2))		.addOperand(MI->getOperand(Operands[2]))
.addOperand(MI->getOperand(1));		.addOperand(MI->getOperand(Operands[3]));
MBB->insert(MachineBasicBlock::iterator(MI), MIB);		MBB->insert(MachineBasicBlock::iterator(MI), MIB);
MI->eraseFromParent();		MI->eraseFromParent();
}		}
}

return MBB;		return MBB;
}		}

MachineBasicBlock *		MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,		X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {		MachineBasicBlock *BB) const {
switch (MI->getOpcode()) {		switch (MI->getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");		default: llvm_unreachable("Unexpected instr type to insert");
▲ Show 20 Lines • Show All 4,869 Lines • Show Last 20 Lines

lib/Target/X86/X86InstrFMA.td

Show First 20 Lines • Show All 66 Lines • ▼ Show 20 Lines	defm r213 : fma3p_rm<opc213,
!strconcat(OpcodeStr, "213", PackTy),		!strconcat(OpcodeStr, "213", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256,		MemFrag128, MemFrag256, OpTy128, OpTy256,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 1,		/* IsMVariantCommutable */ 1,
Op>;		Op>;
let neverHasSideEffects = 1 in {		let neverHasSideEffects = 1 in {
defm r132 : fma3p_rm<opc132,		defm r132 : fma3p_rm<opc132,
!strconcat(OpcodeStr, "132", PackTy),		!strconcat(OpcodeStr, "132", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256>;		MemFrag128, MemFrag256, OpTy128, OpTy256,
		/* IsRVariantCommutable */ 1,
		/* IsMVariantCommutable */ 0>;
// For 231, only the register variant is commutable.		// For 231, only the register variant is commutable.
// For the memory variant the folded operand must be in 3. Thus,		// For the memory variant the folded operand must be in 3. Thus,
// in that case, it cannot be swapped with 2.		// in that case, it cannot be swapped with 2.
defm r231 : fma3p_rm<opc231,		defm r231 : fma3p_rm<opc231,
!strconcat(OpcodeStr, "231", PackTy),		!strconcat(OpcodeStr, "231", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256,		MemFrag128, MemFrag256, OpTy128, OpTy256,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 0>;		/* IsMVariantCommutable */ 0>;
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines

multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,		multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string PT2, Intrinsic Int,		string OpStr, string PackTy, string PT2, Intrinsic Int,
SDNode OpNode, RegisterClass RC, ValueType OpVT,		SDNode OpNode, RegisterClass RC, ValueType OpVT,
X86MemOperand x86memop, Operand memop, PatFrag mem_frag,		X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
ComplexPattern mem_cpat> {		ComplexPattern mem_cpat> {
let neverHasSideEffects = 1 in {		let neverHasSideEffects = 1 in {
defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),		defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
x86memop, RC, OpVT, mem_frag>;		x86memop, RC, OpVT, mem_frag,
		/* IsRVariantCommutable */ 1,
		/* IsMVariantCommutable */ 0>;
// See the other defm of r231 for the explanation regarding the		// See the other defm of r231 for the explanation regarding the
// commutable flags.		// commutable flags.
defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),		defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
x86memop, RC, OpVT, mem_frag,		x86memop, RC, OpVT, mem_frag,
/* IsRVariantCommutable */ 1,		/* IsRVariantCommutable */ 1,
/* IsMVariantCommutable */ 0>;		/* IsMVariantCommutable */ 0>;
}		}

▲ Show 20 Lines • Show All 222 Lines • Show Last 20 Lines

test/CodeGen/X86/fma_patterns.ll

	Show First 20 Lines • Show All 177 Lines • ▼ Show 20 Lines
	define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {			define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
	%x = fsub float -0.000000e+00, %a0			%x = fsub float -0.000000e+00, %a0
	%y = fmul float %x, %a1			%y = fmul float %x, %a1
	%res = fsub float %y, %a2			%res = fsub float %y, %a2
	ret float %res			ret float %res
	}			}

	; CHECK: test_x86_fmadd_ps_load			; CHECK: test_x86_fmadd_ps_load
	; CHECK: vmovaps (%rdi), %xmm2			; CHECK: vfmadd132ps (%rdi), %xmm1, %xmm0
	; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2
	; CHECK: ret			; CHECK: ret
	; CHECK_FMA4: test_x86_fmadd_ps_load			; CHECK_FMA4: test_x86_fmadd_ps_load
	; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0			; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
	; CHECK_FMA4: ret			; CHECK_FMA4: ret
	define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
	%x = load <4 x float>* %a0			%x = load <4 x float>* %a0
	%y = fmul <4 x float> %x, %a1			%y = fmul <4 x float> %x, %a1
	%res = fadd <4 x float> %y, %a2			%res = fadd <4 x float> %y, %a2
	ret <4 x float> %res			ret <4 x float> %res
	}			}

	; CHECK: test_x86_fmsub_ps_load			; CHECK: test_x86_fmsub_ps_load
	; CHECK: vmovaps (%rdi), %xmm2			; CHECK: vfmsub132ps (%rdi), %xmm1, %xmm0
	; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2
	; CHECK: ret			; CHECK: ret
	; CHECK_FMA4: test_x86_fmsub_ps_load			; CHECK_FMA4: test_x86_fmsub_ps_load
	; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0			; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
	; CHECK_FMA4: ret			; CHECK_FMA4: ret
	define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {			define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
	%x = load <4 x float>* %a0			%x = load <4 x float>* %a0
	%y = fmul <4 x float> %x, %a1			%y = fmul <4 x float> %x, %a1
	%res = fsub <4 x float> %y, %a2			%res = fsub <4 x float> %y, %a2
	ret <4 x float> %res			ret <4 x float> %res
	}			}