Diff 425626

llvm/include/llvm/CodeGen/MachineCombinerPattern.h

Show All 28 Lines	enum class MachineCombinerPattern {
REASSOC_XY_AMM_BMM,		REASSOC_XY_AMM_BMM,
REASSOC_XMM_AMM_BMM,		REASSOC_XMM_AMM_BMM,

// These are patterns matched by the PowerPC to reassociate FMA and FSUB to		// These are patterns matched by the PowerPC to reassociate FMA and FSUB to
// reduce register pressure.		// reduce register pressure.
REASSOC_XY_BCA,		REASSOC_XY_BCA,
REASSOC_XY_BAC,		REASSOC_XY_BAC,

		// These are patterns used to reduce the length of the dependence chain.
		SUBADD_OP1,
		SUBADD_OP2,

// These are multiply-add patterns matched by the AArch64 machine combiner.		// These are multiply-add patterns matched by the AArch64 machine combiner.
MULADDW_OP1,		MULADDW_OP1,
MULADDW_OP2,		MULADDW_OP2,
MULSUBW_OP1,		MULSUBW_OP1,
MULSUBW_OP2,		MULSUBW_OP2,
MULADDWI_OP1,		MULADDWI_OP1,
MULSUBWI_OP1,		MULSUBWI_OP1,
MULADDX_OP1,		MULADDX_OP1,
▲ Show 20 Lines • Show All 128 Lines • Show Last 20 Lines

llvm/lib/CodeGen/MachineCombiner.cpp

Show First 20 Lines • Show All 86 Lines • ▼ Show 20 Lines	public:
void getAnalysisUsage(AnalysisUsage &AU) const override;		void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnMachineFunction(MachineFunction &MF) override;		bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override { return "Machine InstCombiner"; }		StringRef getPassName() const override { return "Machine InstCombiner"; }

private:		private:
bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);		bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
bool combineInstructions(MachineBasicBlock *);		bool combineInstructions(MachineBasicBlock *);
MachineInstr *getOperandDef(const MachineOperand &MO);		MachineInstr *getOperandDef(const MachineOperand &MO);
unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,		bool isCoalescableCopy(MachineInstr *MI);
		std::pair<unsigned, unsigned>
		getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,		DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineTraceMetrics::Trace BlockTrace);		MachineTraceMetrics::Trace BlockTrace);
unsigned getLatency(MachineInstr Root, MachineInstr NewRoot,		unsigned getLatency(MachineInstr Root, MachineInstr NewRoot,
MachineTraceMetrics::Trace BlockTrace);		MachineTraceMetrics::Trace BlockTrace);
bool		bool
improvesCriticalPathLen(MachineBasicBlock MBB, MachineInstr Root,		improvesCriticalPathLen(MachineBasicBlock MBB, MachineInstr Root,
MachineTraceMetrics::Trace BlockTrace,		MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,		SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,		SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,		DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))		if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
DefInstr = MRI->getUniqueVRegDef(MO.getReg());		DefInstr = MRI->getUniqueVRegDef(MO.getReg());
// PHI's have no depth etc.		// PHI's have no depth etc.
if (DefInstr && DefInstr->isPHI())		if (DefInstr && DefInstr->isPHI())
DefInstr = nullptr;		DefInstr = nullptr;
return DefInstr;		return DefInstr;
}		}

		/// Check if MI is a COPY instruction, and its src and dst registers can be
		/// coalesced.
		///
		/// \param MI instruction to test; it is dereferenced unconditionally, so it
		/// must not be null.
		/// \returns false if \p MI is not a COPY; otherwise true when one of the
		/// register / register-class relations checked below holds between the
		/// COPY's destination (operand 0) and source (operand 1).
		bool MachineCombiner::isCoalescableCopy(MachineInstr *MI) {
		if (!MI->isCopy())
		return false;

		Register Dst = MI->getOperand(0).getReg();
		Register Src = MI->getOperand(1).getReg();

		// Case 1: not a full copy (presumably a sub-register index is present on
		// at least one operand -- confirm against MachineInstr::isFullCopy).
		if (!MI->isFullCopy()) {
		// If src RC contains super registers of dst RC, it can also be coalesced.
		// A sub-register index on the destination, or a physical register on
		// either side, is rejected outright.
		if (MI->getOperand(0).getSubReg() \|\| Src.isPhysical() \|\| Dst.isPhysical())
		return false;

		// Both registers are virtual here. Coalescable only when
		// getMatchingSuperRegClass returns a class for (SrcRC, DstRC, SrcSub).
		auto SrcSub = MI->getOperand(1).getSubReg();
		auto SrcRC = MRI->getRegClass(Src);
		auto DstRC = MRI->getRegClass(Dst);
		return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
		}

		// Case 2: full copy between two physical registers -- only a copy of a
		// register to itself qualifies.
		if (Src.isPhysical() && Dst.isPhysical())
		return Src == Dst;

		// Case 3: full copy between two virtual registers -- their classes must be
		// related (DstRC is a super-class-or-equal, or a sub-class-or-equal, of
		// SrcRC).
		if (Src.isVirtual() && Dst.isVirtual()) {
		auto SrcRC = MRI->getRegClass(Src);
		auto DstRC = MRI->getRegClass(Dst);
		return SrcRC->hasSuperClassEq(DstRC) \|\| SrcRC->hasSubClassEq(DstRC);
		}

		// Case 4: one physical and one virtual register. Normalise the pair so the
		// same check serves both copy directions.
		if (Src.isVirtual())
		std::swap(Src, Dst);

		// Now Src is physical register, Dst is virtual register.
		// Coalescable when the virtual register's class contains the physical one.
		auto DstRC = MRI->getRegClass(Dst);
		return DstRC->contains(Src);
		}

/// Computes depth of instructions in vector \InsInstr.		/// Computes depth of instructions in vector \InsInstr.
///		///
/// \param InsInstrs is a vector of machine instructions		/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index		/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs		/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions		/// \param BlockTrace is a trace of machine instructions
///		///
/// \returns Depth of last instruction in \InsInstrs ("NewRoot")		/// \returns Depths of the first and of the last ("NewRoot") instruction in \p InsInstrs, as a {first, last} pair
unsigned		std::pair<unsigned, unsigned>
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,		MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,		DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineTraceMetrics::Trace BlockTrace) {		MachineTraceMetrics::Trace BlockTrace) {
SmallVector<unsigned, 16> InstrDepth;		SmallVector<unsigned, 16> InstrDepth;
assert(TSchedModel.hasInstrSchedModelOrItineraries() &&		assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
"Missing machine model\n");		"Missing machine model\n");

// For each instruction in the new sequence compute the depth based on the		// For each instruction in the new sequence compute the depth based on the
Show All 21 Lines	for (const MachineOperand &MO : InstrPtr->operands()) {
int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());		int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());		int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,		LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
InstrPtr, UseIdx);		InstrPtr, UseIdx);
} else {		} else {
MachineInstr *DefInstr = getOperandDef(MO);		MachineInstr *DefInstr = getOperandDef(MO);
if (DefInstr) {		if (DefInstr) {
DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;		DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
		if (!isCoalescableCopy(DefInstr))
LatencyOp = TSchedModel.computeOperandLatency(		LatencyOp = TSchedModel.computeOperandLatency(
DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),		DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));		InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
}		}
}		}
IDepth = std::max(IDepth, DepthOp + LatencyOp);		IDepth = std::max(IDepth, DepthOp + LatencyOp);
}		}
InstrDepth.push_back(IDepth);		InstrDepth.push_back(IDepth);
}		}
unsigned NewRootIdx = InsInstrs.size() - 1;		unsigned NewRootIdx = InsInstrs.size() - 1;
return InstrDepth[NewRootIdx];		return {InstrDepth[0], InstrDepth[NewRootIdx]};
}		}

/// Computes instruction latency as max of latency of defined operands.		/// Computes instruction latency as max of latency of defined operands.
///		///
/// \param Root is a machine instruction that could be replaced by NewRoot.		/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in		/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)		/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed		/// \param NewRoot is the instruction for which the latency is computed
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
// MachineCombinerPattern class.		// MachineCombinerPattern class.
switch (P) {		switch (P) {
case MachineCombinerPattern::REASSOC_AX_BY:		case MachineCombinerPattern::REASSOC_AX_BY:
case MachineCombinerPattern::REASSOC_AX_YB:		case MachineCombinerPattern::REASSOC_AX_YB:
case MachineCombinerPattern::REASSOC_XA_BY:		case MachineCombinerPattern::REASSOC_XA_BY:
case MachineCombinerPattern::REASSOC_XA_YB:		case MachineCombinerPattern::REASSOC_XA_YB:
case MachineCombinerPattern::REASSOC_XY_AMM_BMM:		case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:		case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
		case MachineCombinerPattern::SUBADD_OP1:
		case MachineCombinerPattern::SUBADD_OP2:
return CombinerObjective::MustReduceDepth;		return CombinerObjective::MustReduceDepth;
case MachineCombinerPattern::REASSOC_XY_BCA:		case MachineCombinerPattern::REASSOC_XY_BCA:
case MachineCombinerPattern::REASSOC_XY_BAC:		case MachineCombinerPattern::REASSOC_XY_BAC:
return CombinerObjective::MustReduceRegisterPressure;		return CombinerObjective::MustReduceRegisterPressure;
default:		default:
return CombinerObjective::Default;		return CombinerObjective::Default;
}		}
}		}
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	bool MachineCombiner::improvesCriticalPathLen(
SmallVectorImpl<MachineInstr *> &InsInstrs,		SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,		SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,		DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineCombinerPattern Pattern,		MachineCombinerPattern Pattern,
bool SlackIsAccurate) {		bool SlackIsAccurate) {
assert(TSchedModel.hasInstrSchedModelOrItineraries() &&		assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
"Missing machine model\n");		"Missing machine model\n");
// Get depth and latency of NewRoot and Root.		// Get depth and latency of NewRoot and Root.
unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);		unsigned NewFirstDepth, NewRootDepth;
		std::tie(NewFirstDepth, NewRootDepth) =
		getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;		unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
		unsigned FirstDepth = BlockTrace.getInstrCycles(*DelInstrs[0]).Depth;

LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "		LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
<< NewRootDepth << "\tRootDepth: " << RootDepth);		<< NewRootDepth << "\tRootDepth: " << RootDepth);

// For a transform such as reassociation, the cost equation is		// For a transform such as reassociation, the cost equation is
// conservatively calculated so that we must improve the depth (data		// conservatively calculated so that we must improve the depth (data
// dependency cycles) in the critical path to proceed with the transform.		// dependency cycles) in the critical path to proceed with the transform.
// Being conservative also protects against inaccuracies in the underlying		// Being conservative also protects against inaccuracies in the underlying
Show All 11 Lines	bool MachineCombiner::improvesCriticalPathLen(
// even if the instruction depths (data dependency cycles) become worse.		// even if the instruction depths (data dependency cycles) become worse.

// Account for the latency of the inserted and deleted instructions by		// Account for the latency of the inserted and deleted instructions by
unsigned NewRootLatency, RootLatency;		unsigned NewRootLatency, RootLatency;
std::tie(NewRootLatency, RootLatency) =		std::tie(NewRootLatency, RootLatency) =
getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);		getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);

unsigned RootSlack = BlockTrace.getInstrSlack(*Root);		unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
unsigned NewCycleCount = NewRootDepth + NewRootLatency;		unsigned NewCycleCount = NewFirstDepth + NewRootLatency;
unsigned OldCycleCount =		unsigned OldCycleCount =
RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);		FirstDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency		LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
<< "\tRootLatency: " << RootLatency << "\n\tRootSlack: "		<< "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
<< RootSlack << " SlackIsAccurate=" << SlackIsAccurate		<< RootSlack << " SlackIsAccurate=" << SlackIsAccurate
<< "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount		<< "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
<< "\n\tRootDepth + RootLatency + RootSlack = "		<< "\n\tRootDepth + RootLatency + RootSlack = "
<< OldCycleCount;);		<< OldCycleCount;);
LLVM_DEBUG(NewCycleCount <= OldCycleCount		LLVM_DEBUG(NewCycleCount <= OldCycleCount
? dbgs() << "\n\t It IMPROVES PathLen because"		? dbgs() << "\n\t It IMPROVES PathLen because"
: dbgs() << "\n\t It DOES NOT improve PathLen because");		: dbgs() << "\n\t It DOES NOT improve PathLen because");
LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount		LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
<< ", OldCycleCount = " << OldCycleCount << "\n");		<< ", OldCycleCount = " << OldCycleCount << "\n");

return NewCycleCount <= OldCycleCount;		return NewCycleCount < OldCycleCount;
}		}

/// helper routine to convert instructions into SC		/// helper routine to convert instructions into SC
void MachineCombiner::instr2instrSC(		void MachineCombiner::instr2instrSC(
SmallVectorImpl<MachineInstr *> &Instrs,		SmallVectorImpl<MachineInstr *> &Instrs,
SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {		SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
for (auto *InstrPtr : Instrs) {		for (auto *InstrPtr : Instrs) {
unsigned Opc = InstrPtr->getOpcode();		unsigned Opc = InstrPtr->getOpcode();
▲ Show 20 Lines • Show All 346 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,821 Lines • ▼ Show 20 Lines	if (CheckZeroReg) {
assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&		assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&		MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");		MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs");
// The third input reg must be zero.		// The third input reg must be zero.
if (MI->getOperand(3).getReg() != ZeroReg)		if (MI->getOperand(3).getReg() != ZeroReg)
return false;		return false;
}		}

		if (isCombineInstrSettingFlag(CombineOpc) &&
		MI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
		return false;

return true;		return true;
}		}

//		//
// Is \param MO defined by an integer multiply and can be combined?		// Is \param MO defined by an integer multiply and can be combined?
static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,		static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
unsigned MulOpc, unsigned ZeroReg) {		unsigned MulOpc, unsigned ZeroReg) {
return canCombine(MBB, MO, MulOpc, ZeroReg, true);		return canCombine(MBB, MO, MulOpc, ZeroReg, true);
▲ Show 20 Lines • Show All 480 Lines • ▼ Show 20 Lines	bool AArch64InstrInfo::isThroughputPattern(
case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:		case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:		case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:		case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:		case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
return true;		return true;
} // end switch (Pattern)		} // end switch (Pattern)
return false;		return false;
}		}

		/// Find other MI combine patterns.
		///
		/// The only shape recognised here is an integer subtract whose second source
		/// operand (operand 2) is produced by an integer add: A - (B + C).
		///
		/// \param Root candidate root instruction; only SUBWrr, SUBSWrr, SUBXrr and
		/// SUBSXrr are considered.
		/// \param Patterns output list; on a match both SUBADD_OP1 and SUBADD_OP2 are
		/// appended, in that order.
		/// \returns true if patterns were appended, false otherwise.
		static bool getMiscPatterns(MachineInstr &Root,
		SmallVectorImpl<MachineCombinerPattern> &Patterns)
		{
		// A - (B + C) ==> (A - B) - C or (A - C) - B
		unsigned Opc = Root.getOpcode();
		MachineBasicBlock &MBB = *Root.getParent();

		// Filter on the root opcode first; anything that is not a register-register
		// subtract is not a candidate.
		switch (Opc) {
		case AArch64::SUBWrr:
		case AArch64::SUBSWrr:
		case AArch64::SUBXrr:
		case AArch64::SUBSXrr:
		// Found candidate root.
		break;
		default:
		return false;
		}

		// A flag-setting root is only accepted when the NZCV lookup below succeeds.
		// NOTE(review): the `true` argument presumably restricts the search to dead
		// definitions, i.e. the flags must be unused -- confirm against
		// MachineInstr::findRegisterDefOperandIdx.
		if (isCombineInstrSettingFlag(Opc) &&
		Root.findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
		return false;

		// Operand 2 (the subtrahend) must be combinable with one of the
		// register-register add opcodes. canCombine's remaining parameters take
		// their defaults (its declaration is outside this view).
		if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) \|\|
		canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) \|\|
		canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) \|\|
		canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
		// Offer both reassociations.
		Patterns.push_back(MachineCombinerPattern::SUBADD_OP1);
		Patterns.push_back(MachineCombinerPattern::SUBADD_OP2);
		return true;
		}

		return false;
		}

/// Return true when there is potentially a faster code sequence for an		/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in		/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Pattern vector. Pattern should be sorted in priority order since the		/// the \p Pattern vector. Pattern should be sorted in priority order since the
/// pattern evaluator stops checking as soon as it finds a faster sequence.		/// pattern evaluator stops checking as soon as it finds a faster sequence.

bool AArch64InstrInfo::getMachineCombinerPatterns(		bool AArch64InstrInfo::getMachineCombinerPatterns(
MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,		MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
bool DoRegPressureReduce) const {		bool DoRegPressureReduce) const {
// Integer patterns		// Integer patterns
if (getMaddPatterns(Root, Patterns))		if (getMaddPatterns(Root, Patterns))
return true;		return true;
// Floating point patterns		// Floating point patterns
if (getFMULPatterns(Root, Patterns))		if (getFMULPatterns(Root, Patterns))
return true;		return true;
if (getFMAPatterns(Root, Patterns))		if (getFMAPatterns(Root, Patterns))
return true;		return true;

		// Other patterns
		if (getMiscPatterns(Root, Patterns))
		return true;

return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,		return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
DoRegPressureReduce);		DoRegPressureReduce);
}		}

enum class FMAInstKind { Default, Indexed, Accumulator };		enum class FMAInstKind { Default, Indexed, Accumulator };
/// genFusedMultiply - Generate fused multiply instructions.		/// genFusedMultiply - Generate fused multiply instructions.
/// This function supports both integer and floating point instructions.		/// This function supports both integer and floating point instructions.
/// A typical example:		/// A typical example:
▲ Show 20 Lines • Show All 231 Lines • ▼ Show 20 Lines	MachineInstrBuilder MIB =
.addReg(SrcReg0, getKillRegState(Src0IsKill))		.addReg(SrcReg0, getKillRegState(Src0IsKill))
.addReg(SrcReg1, getKillRegState(Src1IsKill))		.addReg(SrcReg1, getKillRegState(Src1IsKill))
.addReg(VR);		.addReg(VR);
// Insert the MADD		// Insert the MADD
InsInstrs.push_back(MIB);		InsInstrs.push_back(MIB);
return MUL;		return MUL;
}		}

		/// Do the following transformation
		/// A - (B + C) ==> (A - B) - C
		/// A - (B + C) ==> (A - C) - B
		///
		/// \param MF function in which the new instructions are created.
		/// \param MRI register info used to find the ADD feeding \p Root and to
		/// create the intermediate virtual register.
		/// \param TII supplies the instruction descriptions for the new SUBs.
		/// \param Root the subtract computing A - (B + C); its operand 2 must be
		/// uniquely defined by the ADD.
		/// \param InsInstrs receives the two new SUB instructions, inner one first.
		/// \param DelInstrs receives the ADD; \p Root itself is not appended here.
		/// \param IdxOpd1 index (1 or 2) of the ADD operand that is subtracted first
		/// (the "B" of the result); the other ADD operand becomes "C".
		/// \param InstrIdxForVirtReg receives the mapping from the new intermediate
		/// virtual register to index 0.
		static void
		genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
		const TargetInstrInfo *TII, MachineInstr &Root,
		SmallVectorImpl<MachineInstr *> &InsInstrs,
		SmallVectorImpl<MachineInstr *> &DelInstrs,
		unsigned IdxOpd1,
		DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
		assert(IdxOpd1 == 1 \|\| IdxOpd1 == 2);
		unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
		// The ADD producing the subtrahend (operand 2) of Root.
		MachineInstr *AddMI = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());

		// Gather the registers involved, together with their kill flags.
		// NOTE(review): the kill flags of B and C are copied from the ADD's
		// operands; confirm they are still valid where the new instructions end up
		// being inserted (e.g. when A and C name the same register).
		Register ResultReg = Root.getOperand(0).getReg();
		Register RegA = Root.getOperand(1).getReg();
		bool RegAIsKill = Root.getOperand(1).isKill();
		Register RegB = AddMI->getOperand(IdxOpd1).getReg();
		bool RegBIsKill = AddMI->getOperand(IdxOpd1).isKill();
		Register RegC = AddMI->getOperand(IdxOtherOpd).getReg();
		bool RegCIsKill = AddMI->getOperand(IdxOtherOpd).isKill();
		// The intermediate result (A - B) lives in a fresh virtual register of the
		// same register class as A.
		Register NewVR = MRI.createVirtualRegister(MRI.getRegClass(RegA));

		// Both new instructions use the non-flag-setting form of Root's opcode.
		unsigned Opcode = Root.getOpcode();
		if (Opcode == AArch64::SUBSWrr)
		Opcode = AArch64::SUBWrr;
		else if (Opcode == AArch64::SUBSXrr)
		Opcode = AArch64::SUBXrr;
		else
		assert((Opcode == AArch64::SUBWrr \|\| Opcode == AArch64::SUBXrr) &&
		"Unexpected instruction opcode.");

		// NewVR = A - B
		MachineInstrBuilder MIB1 =
		BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), NewVR)
		.addReg(RegA, getKillRegState(RegAIsKill))
		.addReg(RegB, getKillRegState(RegBIsKill));
		// Result = NewVR - C; NewVR has no other user, so it is always killed here.
		MachineInstrBuilder MIB2 =
		BuildMI(MF, Root.getDebugLoc(), TII->get(Opcode), ResultReg)
		.addReg(NewVR, getKillRegState(true))
		.addReg(RegC, getKillRegState(RegCIsKill));

		// Index 0 is MIB1's position in InsInstrs, assuming InsInstrs is empty on
		// entry -- TODO confirm against the caller.
		InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
		InsInstrs.push_back(MIB1);
		InsInstrs.push_back(MIB2);
		DelInstrs.push_back(AddMI);
		}

/// When getMachineCombinerPatterns() finds potential patterns,		/// When getMachineCombinerPatterns() finds potential patterns,
/// this function generates the instructions that could replace the		/// this function generates the instructions that could replace the
/// original code sequence		/// original code sequence
void AArch64InstrInfo::genAlternativeCodeSequence(		void AArch64InstrInfo::genAlternativeCodeSequence(
MachineInstr &Root, MachineCombinerPattern Pattern,		MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,		SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,		SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {		DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
MachineBasicBlock &MBB = *Root.getParent();		MachineBasicBlock &MBB = *Root.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();		MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction &MF = *MBB.getParent();		MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();		const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();

MachineInstr *MUL = nullptr;		MachineInstr *MUL = nullptr;
const TargetRegisterClass *RC;		const TargetRegisterClass *RC;
unsigned Opc;		unsigned Opc;
switch (Pattern) {		switch (Pattern) {
default:		default:
// Reassociate instructions.		// Reassociate instructions.
TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,		TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
DelInstrs, InstrIdxForVirtReg);		DelInstrs, InstrIdxForVirtReg);
return;		return;
		case MachineCombinerPattern::SUBADD_OP1:
		// A - (B + C)
		// ==> (A - B) - C
		genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 1,
		InstrIdxForVirtReg);
		break;
		case MachineCombinerPattern::SUBADD_OP2:
		// A - (B + C)
		// ==> (A - C) - B
		genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, 2,
		InstrIdxForVirtReg);
		break;
case MachineCombinerPattern::MULADDW_OP1:		case MachineCombinerPattern::MULADDW_OP1:
case MachineCombinerPattern::MULADDX_OP1:		case MachineCombinerPattern::MULADDX_OP1:
// MUL I=A,B,0		// MUL I=A,B,0
// ADD R,I,C		// ADD R,I,C
// ==> MADD R,A,B,C		// ==> MADD R,A,B,C
// --- Create(MADD);		// --- Create(MADD);
if (Pattern == MachineCombinerPattern::MULADDW_OP1) {		if (Pattern == MachineCombinerPattern::MULADDW_OP1) {
Opc = AArch64::MADDWrrr;		Opc = AArch64::MADDWrrr;
▲ Show 20 Lines • Show All 2,383 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir

	# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s \| FileCheck --check-prefixes=UNPROFITABLE,ALL %s			# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s \| FileCheck --check-prefixes=UNPROFITABLE %s
	# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s -machine-combiner-verify-pattern-order=true \| FileCheck --check-prefixes=PROFITABLE,ALL %s			# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s -machine-combiner-verify-pattern-order=true \| FileCheck --check-prefixes=PROFITABLE %s
	# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s \| FileCheck --check-prefixes=PROFITABLE,ALL %s			# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m3 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s \| FileCheck --check-prefixes=EXYNOS %s
	# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s \| FileCheck --check-prefixes=PROFITABLE,ALL %s			# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s \| FileCheck --check-prefixes=THUNDER2 %s
	# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx3t110 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s \| FileCheck --check-prefixes=PROFITABLE,ALL %s			# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx3t110 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s \| FileCheck --check-prefixes=THUNDER3 %s
	#			#
	name: f1_2s			name: f1_2s
	registers:			registers:
	- { id: 0, class: fpr64 }			- { id: 0, class: fpr64 }
	- { id: 1, class: fpr64 }			- { id: 1, class: fpr64 }
	- { id: 2, class: fpr64 }			- { id: 2, class: fpr64 }
	- { id: 3, class: fpr64 }			- { id: 3, class: fpr64 }
	- { id: 4, class: fpr64 }			- { id: 4, class: fpr64 }
	body: \|			body: \|
	bb.0.entry:			bb.0.entry:
	%2:fpr64 = COPY $d2			%2:fpr64 = COPY $d2
	%1:fpr64 = COPY $d1			%1:fpr64 = COPY $d1
	%0:fpr64 = COPY $d0			%0:fpr64 = COPY $d0
	%3:fpr64 = FMULv2f32 %0, %1			%3:fpr64 = FMULv2f32 %0, %1
	%4:fpr64 = FSUBv2f32 killed %3, %2			%4:fpr64 = FSUBv2f32 killed %3, %2
	$d0 = COPY %4			$d0 = COPY %4
	RET_ReallyLR implicit $d0			RET_ReallyLR implicit $d0

	...			...
	# UNPROFITABLE-LABEL: name: f1_2s			# UNPROFITABLE-LABEL: name: f1_2s
	# UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1			# UNPROFITABLE: %3:fpr64 = FMULv2f32 %0, %1
	# UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2			# UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
	#			#
				# THUNDER2-LABEL: name: f1_2s
				# THUNDER2: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
				# THUNDER2-NEXT: FMLAv2f32 killed [[R1]], %0, %1
				#
				# THUNDER3-LABEL: name: f1_2s
				# THUNDER3: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
				# THUNDER3-NEXT: FMLAv2f32 killed [[R1]], %0, %1
				#
	# PROFITABLE-LABEL: name: f1_2s			# PROFITABLE-LABEL: name: f1_2s
	# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2			# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
	# PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1			# PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1
				#
				# EXYNOS-LABEL: name: f1_2s
				# EXYNOS: %3:fpr64 = FMULv2f32 %0, %1
				# EXYNOS-NEXT: FSUBv2f32 killed %3, %2
	---			---
	name: f1_4s			name: f1_4s
	registers:			registers:
	- { id: 0, class: fpr128 }			- { id: 0, class: fpr128 }
	- { id: 1, class: fpr128 }			- { id: 1, class: fpr128 }
	- { id: 2, class: fpr128 }			- { id: 2, class: fpr128 }
	- { id: 3, class: fpr128 }			- { id: 3, class: fpr128 }
	- { id: 4, class: fpr128 }			- { id: 4, class: fpr128 }
	body: \|			body: \|
	bb.0.entry:			bb.0.entry:
	%2:fpr128 = COPY $q2			%2:fpr128 = COPY $q2
	%1:fpr128 = COPY $q1			%1:fpr128 = COPY $q1
	%0:fpr128 = COPY $q0			%0:fpr128 = COPY $q0
	%3:fpr128 = FMULv4f32 %0, %1			%3:fpr128 = FMULv4f32 %0, %1
	%4:fpr128 = FSUBv4f32 killed %3, %2			%4:fpr128 = FSUBv4f32 killed %3, %2
	$q0 = COPY %4			$q0 = COPY %4
	RET_ReallyLR implicit $q0			RET_ReallyLR implicit $q0

	...			...
	# UNPROFITABLE-LABEL: name: f1_4s			# UNPROFITABLE-LABEL: name: f1_4s
	# UNPROFITABLE: %3:fpr128 = FMULv4f32 %0, %1			# UNPROFITABLE: %3:fpr128 = FMULv4f32 %0, %1
	# UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2			# UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
	#			#
				# THUNDER2-LABEL: name: f1_4s
				# THUNDER2: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
				# THUNDER2-NEXT: FMLAv4f32 killed [[R1]], %0, %1
				#
				# THUNDER3-LABEL: name: f1_4s
				# THUNDER3: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
				# THUNDER3-NEXT: FMLAv4f32 killed [[R1]], %0, %1
				#
	# PROFITABLE-LABEL: name: f1_4s			# PROFITABLE-LABEL: name: f1_4s
	# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2			# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
	# PROFITABLE-NEXT: FMLAv4f32 killed [[R1]], %0, %1			# PROFITABLE-NEXT: FMLAv4f32 killed [[R1]], %0, %1
				#
				# EXYNOS-LABEL: name: f1_4s
				# EXYNOS: %3:fpr128 = FMULv4f32 %0, %1
				# EXYNOS-NEXT: FSUBv4f32 killed %3, %2
	---			---
	name: f1_2d			name: f1_2d
	registers:			registers:
	- { id: 0, class: fpr128 }			- { id: 0, class: fpr128 }
	- { id: 1, class: fpr128 }			- { id: 1, class: fpr128 }
	- { id: 2, class: fpr128 }			- { id: 2, class: fpr128 }
	- { id: 3, class: fpr128 }			- { id: 3, class: fpr128 }
	- { id: 4, class: fpr128 }			- { id: 4, class: fpr128 }
	body: \|			body: \|
	bb.0.entry:			bb.0.entry:
	%2:fpr128 = COPY $q2			%2:fpr128 = COPY $q2
	%1:fpr128 = COPY $q1			%1:fpr128 = COPY $q1
	%0:fpr128 = COPY $q0			%0:fpr128 = COPY $q0
	%3:fpr128 = FMULv2f64 %0, %1			%3:fpr128 = FMULv2f64 %0, %1
	%4:fpr128 = FSUBv2f64 killed %3, %2			%4:fpr128 = FSUBv2f64 killed %3, %2
	$q0 = COPY %4			$q0 = COPY %4
	RET_ReallyLR implicit $q0			RET_ReallyLR implicit $q0

	...			...
	# UNPROFITABLE-LABEL: name: f1_2d			# UNPROFITABLE-LABEL: name: f1_2d
	# UNPROFITABLE: %3:fpr128 = FMULv2f64 %0, %1			# UNPROFITABLE: %3:fpr128 = FMULv2f64 %0, %1
	# UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2			# UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2
	#			#
				# THUNDER2-LABEL: name: f1_2d
				# THUNDER2: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
				# THUNDER2-NEXT: FMLAv2f64 killed [[R1]], %0, %1
				#
				# THUNDER3-LABEL: name: f1_2d
				# THUNDER3: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
				# THUNDER3-NEXT: FMLAv2f64 killed [[R1]], %0, %1
				#
	# PROFITABLE-LABEL: name: f1_2d			# PROFITABLE-LABEL: name: f1_2d
	# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2			# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
	# PROFITABLE-NEXT: FMLAv2f64 killed [[R1]], %0, %1			# PROFITABLE-NEXT: FMLAv2f64 killed [[R1]], %0, %1
				#
				# EXYNOS-LABEL: name: f1_2d
				# EXYNOS: %3:fpr128 = FMULv2f64 %0, %1
				# EXYNOS-NEXT: FSUBv2f64 killed %3, %2
	---			---
	name: f1_both_fmul_2s			name: f1_both_fmul_2s
	registers:			registers:
	- { id: 0, class: fpr64 }			- { id: 0, class: fpr64 }
	- { id: 1, class: fpr64 }			- { id: 1, class: fpr64 }
	- { id: 2, class: fpr64 }			- { id: 2, class: fpr64 }
	- { id: 3, class: fpr64 }			- { id: 3, class: fpr64 }
	- { id: 4, class: fpr64 }			- { id: 4, class: fpr64 }
	- { id: 5, class: fpr64 }			- { id: 5, class: fpr64 }
	- { id: 6, class: fpr64 }			- { id: 6, class: fpr64 }
	body: |			body: |
	bb.0.entry:			bb.0.entry:
	%3:fpr64 = COPY $q3			%3:fpr64 = COPY $q3
	%2:fpr64 = COPY $q2			%2:fpr64 = COPY $q2
	%1:fpr64 = COPY $q1			%1:fpr64 = COPY $q1
	%0:fpr64 = COPY $q0			%0:fpr64 = COPY $q0
	%4:fpr64 = FMULv2f32 %0, %1			%4:fpr64 = FMULv2f32 %0, %1
	%5:fpr64 = FMULv2f32 %2, %3			%5:fpr64 = FMULv2f32 %2, %3
	%6:fpr64 = FSUBv2f32 killed %4, %5			%6:fpr64 = FSUBv2f32 killed %4, %5
	$q0 = COPY %6			$q0 = COPY %6
	RET_ReallyLR implicit $q0			RET_ReallyLR implicit $q0

	...			...
	# ALL-LABEL: name: f1_both_fmul_2s			# UNPROFITABLE-LABEL: name: f1_both_fmul_2s
	# ALL: %4:fpr64 = FMULv2f32 %0, %1			# UNPROFITABLE: %4:fpr64 = FMULv2f32 %0, %1
	# ALL-NEXT: FMLSv2f32 killed %4, %2, %3			# UNPROFITABLE-NEXT: %5:fpr64 = FMULv2f32 %2, %3
				# UNPROFITABLE-NEXT: FSUBv2f32 killed %4, %5
				#
				# THUNDER2-LABEL: name: f1_both_fmul_2s
				# THUNDER2: %4:fpr64 = FMULv2f32 %0, %1
				# THUNDER2-NEXT: %5:fpr64 = FMULv2f32 %2, %3
				# THUNDER2-NEXT: FSUBv2f32 killed %4, %5
				#
				# THUNDER3-LABEL: name: f1_both_fmul_2s
				# THUNDER3: %4:fpr64 = FMULv2f32 %0, %1
				# THUNDER3-NEXT: %6:fpr64 = FMLSv2f32 killed %4, %2, %3
				#
				# PROFITABLE-LABEL: name: f1_both_fmul_2s
				# PROFITABLE: %4:fpr64 = FMULv2f32 %0, %1
				# PROFITABLE-NEXT: %5:fpr64 = FMULv2f32 %2, %3
				# PROFITABLE-NEXT: FSUBv2f32 killed %4, %5
				#
				# EXYNOS-LABEL: name: f1_both_fmul_2s
				# EXYNOS: %4:fpr64 = FMULv2f32 %0, %1
				# EXYNOS-NEXT: %5:fpr64 = FMULv2f32 %2, %3
				# EXYNOS-NEXT: FSUBv2f32 killed %4, %5
	---			---
	name: f1_both_fmul_4s			name: f1_both_fmul_4s
	registers:			registers:
	- { id: 0, class: fpr128 }			- { id: 0, class: fpr128 }
	- { id: 1, class: fpr128 }			- { id: 1, class: fpr128 }
	- { id: 2, class: fpr128 }			- { id: 2, class: fpr128 }
	- { id: 3, class: fpr128 }			- { id: 3, class: fpr128 }
	- { id: 4, class: fpr128 }			- { id: 4, class: fpr128 }
	- { id: 5, class: fpr128 }			- { id: 5, class: fpr128 }
	- { id: 6, class: fpr128 }			- { id: 6, class: fpr128 }
	body: |			body: |
	bb.0.entry:			bb.0.entry:
	%3:fpr128 = COPY $q3			%3:fpr128 = COPY $q3
	%2:fpr128 = COPY $q2			%2:fpr128 = COPY $q2
	%1:fpr128 = COPY $q1			%1:fpr128 = COPY $q1
	%0:fpr128 = COPY $q0			%0:fpr128 = COPY $q0
	%4:fpr128 = FMULv4f32 %0, %1			%4:fpr128 = FMULv4f32 %0, %1
	%5:fpr128 = FMULv4f32 %2, %3			%5:fpr128 = FMULv4f32 %2, %3
	%6:fpr128 = FSUBv4f32 killed %4, %5			%6:fpr128 = FSUBv4f32 killed %4, %5
	$q0 = COPY %6			$q0 = COPY %6
	RET_ReallyLR implicit $q0			RET_ReallyLR implicit $q0

	...			...
	# ALL-LABEL: name: f1_both_fmul_4s			# UNPROFITABLE-LABEL: name: f1_both_fmul_4s
	# ALL: %4:fpr128 = FMULv4f32 %0, %1			# UNPROFITABLE: %4:fpr128 = FMULv4f32 %0, %1
	# ALL-NEXT: FMLSv4f32 killed %4, %2, %3			# UNPROFITABLE-NEXT: %5:fpr128 = FMULv4f32 %2, %3
				# UNPROFITABLE-NEXT: FSUBv4f32 killed %4, %5
				#
				# THUNDER2-LABEL: name: f1_both_fmul_4s
				# THUNDER2: %4:fpr128 = FMULv4f32 %0, %1
				# THUNDER2-NEXT: %5:fpr128 = FMULv4f32 %2, %3
				# THUNDER2-NEXT: FSUBv4f32 killed %4, %5
				#
				# THUNDER3-LABEL: name: f1_both_fmul_4s
				# THUNDER3: %4:fpr128 = FMULv4f32 %0, %1
				# THUNDER3-NEXT: %6:fpr128 = FMLSv4f32 killed %4, %2, %3
				#
				# PROFITABLE-LABEL: name: f1_both_fmul_4s
				# PROFITABLE: %4:fpr128 = FMULv4f32 %0, %1
				# PROFITABLE-NEXT: %5:fpr128 = FMULv4f32 %2, %3
				# PROFITABLE-NEXT: FSUBv4f32 killed %4, %5
				#
				# EXYNOS-LABEL: name: f1_both_fmul_4s
				# EXYNOS: %4:fpr128 = FMULv4f32 %0, %1
				# EXYNOS-NEXT: %5:fpr128 = FMULv4f32 %2, %3
				# EXYNOS-NEXT: FSUBv4f32 killed %4, %5
	---			---
	name: f1_both_fmul_2d			name: f1_both_fmul_2d
	registers:			registers:
	- { id: 0, class: fpr128 }			- { id: 0, class: fpr128 }
	- { id: 1, class: fpr128 }			- { id: 1, class: fpr128 }
	- { id: 2, class: fpr128 }			- { id: 2, class: fpr128 }
	- { id: 3, class: fpr128 }			- { id: 3, class: fpr128 }
	- { id: 4, class: fpr128 }			- { id: 4, class: fpr128 }
	- { id: 5, class: fpr128 }			- { id: 5, class: fpr128 }
	- { id: 6, class: fpr128 }			- { id: 6, class: fpr128 }
	body: |			body: |
	bb.0.entry:			bb.0.entry:
	%3:fpr128 = COPY $q3			%3:fpr128 = COPY $q3
	%2:fpr128 = COPY $q2			%2:fpr128 = COPY $q2
	%1:fpr128 = COPY $q1			%1:fpr128 = COPY $q1
	%0:fpr128 = COPY $q0			%0:fpr128 = COPY $q0
	%4:fpr128 = FMULv2f64 %0, %1			%4:fpr128 = FMULv2f64 %0, %1
	%5:fpr128 = FMULv2f64 %2, %3			%5:fpr128 = FMULv2f64 %2, %3
	%6:fpr128 = FSUBv2f64 killed %4, %5			%6:fpr128 = FSUBv2f64 killed %4, %5
	$q0 = COPY %6			$q0 = COPY %6
	RET_ReallyLR implicit $q0			RET_ReallyLR implicit $q0

	...			...
	# ALL-LABEL: name: f1_both_fmul_2d			# UNPROFITABLE-LABEL: name: f1_both_fmul_2d
	# ALL: %4:fpr128 = FMULv2f64 %0, %1			# UNPROFITABLE: %4:fpr128 = FMULv2f64 %0, %1
	# ALL-NEXT: FMLSv2f64 killed %4, %2, %3			# UNPROFITABLE-NEXT: %5:fpr128 = FMULv2f64 %2, %3
				# UNPROFITABLE-NEXT: FSUBv2f64 killed %4, %5
				#
				# THUNDER2-LABEL: name: f1_both_fmul_2d
				# THUNDER2: %4:fpr128 = FMULv2f64 %0, %1
				# THUNDER2-NEXT: %5:fpr128 = FMULv2f64 %2, %3
				# THUNDER2-NEXT: FSUBv2f64 killed %4, %5
				#
				# THUNDER3-LABEL: name: f1_both_fmul_2d
				# THUNDER3: %4:fpr128 = FMULv2f64 %0, %1
				# THUNDER3-NEXT: %6:fpr128 = FMLSv2f64 killed %4, %2, %3
				#
				# PROFITABLE-LABEL: name: f1_both_fmul_2d
				# PROFITABLE: %4:fpr128 = FMULv2f64 %0, %1
				# PROFITABLE-NEXT: %5:fpr128 = FMULv2f64 %2, %3
				# PROFITABLE-NEXT: FSUBv2f64 killed %4, %5
				#
				# EXYNOS-LABEL: name: f1_both_fmul_2d
				# EXYNOS: %4:fpr128 = FMULv2f64 %0, %1
				# EXYNOS-NEXT: %5:fpr128 = FMULv2f64 %2, %3
				# EXYNOS-NEXT: FSUBv2f64 killed %4, %5

llvm/test/CodeGen/AArch64/addsub_ext.ll

Show First 20 Lines • Show All 564 Lines • ▼ Show 20 Lines	entry:
ret i1 %ret		ret i1 %ret
}		}

; Check that implicit zext from w reg write is used instead of uxtw		; Check that implicit zext from w reg write is used instead of uxtw
; form of add, leading to madd selection.		; form of add, leading to madd selection.
define dso_local i64 @madd_fold_uxtw(i32 %x, i64 %y) {		define dso_local i64 @madd_fold_uxtw(i32 %x, i64 %y) {
; CHECK-LABEL: madd_fold_uxtw:		; CHECK-LABEL: madd_fold_uxtw:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and w8, w0, #0x3		; CHECK-NEXT: mul x8, x1, x1
; CHECK-NEXT: madd x0, x1, x1, x8		; CHECK-NEXT: and w9, w0, #0x3
		; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%m = and i32 %x, 3		%m = and i32 %x, 3
%ext = zext i32 %m to i64		%ext = zext i32 %m to i64
%mul = mul i64 %y, %y		%mul = mul i64 %y, %y
%ret = add i64 %mul, %ext		%ret = add i64 %mul, %ext
ret i64 %ret		ret i64 %ret
}		}
▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/arm64-fma-combines.ll

	; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -enable-unsafe-fp-math -verify-machineinstrs | FileCheck %s			; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -enable-unsafe-fp-math -verify-machineinstrs | FileCheck %s

	define void @foo_2d(double* %src) {			define void @foo_2d(double* %src) {
	; CHECK-LABEL: %entry			; CHECK-LABEL: %entry
	; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}			; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
	; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}			; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
				; CHECK-NEXT: fadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
	entry:			entry:
	%arrayidx1 = getelementptr inbounds double, double* %src, i64 5			%arrayidx1 = getelementptr inbounds double, double* %src, i64 5
	%arrayidx2 = getelementptr inbounds double, double* %src, i64 11			%arrayidx2 = getelementptr inbounds double, double* %src, i64 11
	%tmp = bitcast double* %arrayidx1 to <2 x double>*			%tmp = bitcast double* %arrayidx1 to <2 x double>*
	%tmp1 = load double, double* %arrayidx2, align 8			%tmp1 = load double, double* %arrayidx2, align 8
	%tmp2 = load double, double* %arrayidx1, align 8			%tmp2 = load double, double* %arrayidx1, align 8
	%fmul = fmul fast double %tmp1, %tmp1			%fmul = fmul fast double %tmp1, %tmp1
	%fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B			%fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B
	▲ Show 20 Lines • Show All 248 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/fadd-combines.ll

	Show First 20 Lines • Show All 224 Lines • ▼ Show 20 Lines
	}			}

	; Not minimum FMF.			; Not minimum FMF.

	define float @fadd_fma_fmul_2(float %a, float %b, float %c, float %d, float %n0) nounwind {			define float @fadd_fma_fmul_2(float %a, float %b, float %c, float %d, float %n0) nounwind {
	; CHECK-LABEL: fadd_fma_fmul_2:			; CHECK-LABEL: fadd_fma_fmul_2:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: fmul s2, s2, s3			; CHECK-NEXT: fmul s2, s2, s3
	; CHECK-NEXT: fmadd s0, s0, s1, s2			; CHECK-NEXT: fmul s0, s0, s1
				; CHECK-NEXT: fadd s0, s0, s2
	; CHECK-NEXT: fadd s0, s4, s0			; CHECK-NEXT: fadd s0, s4, s0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%m1 = fmul float %a, %b			%m1 = fmul float %a, %b
	%m2 = fmul float %c, %d			%m2 = fmul float %c, %d
	%a1 = fadd contract float %m1, %m2			%a1 = fadd contract float %m1, %m2
	%a2 = fadd contract float %n0, %a1			%a2 = fadd contract float %n0, %a1
	ret float %a2			ret float %a2
	}			}
	▲ Show 20 Lines • Show All 78 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/i128-math.ll

Show First 20 Lines • Show All 301 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%1 = tail call i128 @llvm.ssub.sat.i128(i128 %x, i128 %y)		%1 = tail call i128 @llvm.ssub.sat.i128(i128 %x, i128 %y)
ret i128 %1		ret i128 %1
}		}

define i128 @u128_mul(i128 %x, i128 %y) {		define i128 @u128_mul(i128 %x, i128 %y) {
; CHECK-LABEL: u128_mul:		; CHECK-LABEL: u128_mul:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: umulh x8, x0, x2		; CHECK-NEXT: umulh x8, x0, x2
		; CHECK-NEXT: mul x9, x1, x2
; CHECK-NEXT: madd x8, x0, x3, x8		; CHECK-NEXT: madd x8, x0, x3, x8
; CHECK-NEXT: mul x0, x0, x2		; CHECK-NEXT: mul x0, x0, x2
; CHECK-NEXT: madd x1, x1, x2, x8		; CHECK-NEXT: add x1, x8, x9
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = mul i128 %x, %y		%1 = mul i128 %x, %y
ret i128 %1		ret i128 %1
}		}

define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {		define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
; CHECK-LABEL: u128_checked_mul:		; CHECK-LABEL: u128_checked_mul:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%4 = select i1 %3, i128 -1, i128 %2		%4 = select i1 %3, i128 -1, i128 %2
ret i128 %4		ret i128 %4
}		}

define i128 @i128_mul(i128 %x, i128 %y) {		define i128 @i128_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_mul:		; CHECK-LABEL: i128_mul:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: umulh x8, x0, x2		; CHECK-NEXT: umulh x8, x0, x2
		; CHECK-NEXT: mul x9, x1, x2
; CHECK-NEXT: madd x8, x0, x3, x8		; CHECK-NEXT: madd x8, x0, x3, x8
; CHECK-NEXT: mul x0, x0, x2		; CHECK-NEXT: mul x0, x0, x2
; CHECK-NEXT: madd x1, x1, x2, x8		; CHECK-NEXT: add x1, x8, x9
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = mul i128 %x, %y		%1 = mul i128 %x, %y
ret i128 %1		ret i128 %1
}		}

define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {		define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_checked_mul:		; CHECK-LABEL: i128_checked_mul:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
▲ Show 20 Lines • Show All 75 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/machine-combiner-madd.ll

	; Test all AArch64 subarches with scheduling models.			; Test all AArch64 subarches with scheduling models.
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=a64fx < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=a64fx < %s | FileCheck %s
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 < %s | FileCheck %s
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 < %s | FileCheck %s
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 < %s | FileCheck %s
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cyclone < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cyclone < %s | FileCheck %s
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 < %s | FileCheck %s
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo < %s | FileCheck %s
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s | FileCheck %s
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx3t110 < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx3t110 < %s | FileCheck %s
	; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=tsv110 < %s | FileCheck %s			; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=tsv110 < %s | FileCheck %s

	; Make sure that inst-combine fuses the multiply add in the addressing mode of			; Make sure that machine combiner doesn't fuse the multiply add because the
	; the load.			; latency of max(mul, load)+add is shorter than load+madd.

	; CHECK-LABEL: fun:			; CHECK-LABEL: fun:
	; CHECK-NOT: mul			; CHECK: mul
	; CHECK: madd			; CHECK-NOT: madd
	; CHECK-NOT: mul

	%class.D = type { %class.basic_string.base, [4 x i8] }			%class.D = type { %class.basic_string.base, [4 x i8] }
	%class.basic_string.base = type <{ i64, i64, i32 }>			%class.basic_string.base = type <{ i64, i64, i32 }>
	@a = global %class.D* zeroinitializer, align 8			@a = global %class.D* zeroinitializer, align 8
	declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)			declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)
	define internal void @fun() section ".text.startup" {			define internal void @fun() section ".text.startup" {
	entry:			entry:
	%tmp.i.i = alloca %class.D, align 8			%tmp.i.i = alloca %class.D, align 8
	Show All 15 Lines

llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck %s

				; The test cases in this file check following transformation if the right form
				; can reduce latency.
				; A - (B + C) ==> (A - B) - C

				; 32 bit version.
				define i32 @test1(i32 %a, i32 %b, i32 %c) {
				; CHECK-LABEL: test1:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: sub w8, w2, w0
				; CHECK-NEXT: eor w9, w1, w0, lsl #8
				; CHECK-NEXT: sub w8, w8, w9
				; CHECK-NEXT: eor w0, w8, w9, asr #13
				; CHECK-NEXT: ret
				entry:
				%shl = shl i32 %a, 8
				%xor = xor i32 %shl, %b
				%add = add i32 %xor, %a
				%sub = sub i32 %c, %add
				%shr = ashr i32 %xor, 13
				%xor2 = xor i32 %sub, %shr
				ret i32 %xor2
				}

				; 64 bit version.
				define i64 @test2(i64 %a, i64 %b, i64 %c) {
				; CHECK-LABEL: test2:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: sub x8, x2, x0
				; CHECK-NEXT: eor x9, x1, x0, lsl #8
				; CHECK-NEXT: sub x8, x8, x9
				; CHECK-NEXT: eor x0, x8, x9, asr #13
				; CHECK-NEXT: ret
				entry:
				%shl = shl i64 %a, 8
				%xor = xor i64 %shl, %b
				%add = add i64 %xor, %a
				%sub = sub i64 %c, %add
				%shr = ashr i64 %xor, 13
				%xor2 = xor i64 %sub, %shr
				ret i64 %xor2
				}

				; Negative test. The right form can't reduce latency.
				define i32 @test3(i32 %a, i32 %b, i32 %c) {
				; CHECK-LABEL: test3:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: add w8, w2, w0
				; CHECK-NEXT: eor w9, w1, w0, lsl #8
				; CHECK-NEXT: sub w8, w9, w8
				; CHECK-NEXT: eor w0, w8, w9, asr #13
				; CHECK-NEXT: ret
				entry:
				%shl = shl i32 %a, 8
				%xor = xor i32 %shl, %b
				%add = add i32 %c, %a
				%sub = sub i32 %xor, %add
				%shr = ashr i32 %xor, 13
				%xor2 = xor i32 %sub, %shr
				ret i32 %xor2
				}

llvm/test/CodeGen/AArch64/madd-combiner.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL			; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-ISEL
	; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FAST			; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FAST

	; Test that we use the correct register class.			; Test that we use the correct register class.
	define i32 @mul_add_imm(i32 %a, i32 %b) {			define i32 @mul_add_imm(i32 %a, i32 %b) {
	; CHECK-LABEL: mul_add_imm:			; CHECK-LABEL: mul_add_imm:
	; CHECK: ; %bb.0:			; CHECK: ; %bb.0:
	; CHECK-NEXT: orr w8, wzr, #0x4			; CHECK-NEXT: mul w8, w0, w1
	; CHECK-NEXT: madd w0, w0, w1, w8			; CHECK-NEXT: add w0, w8, #4
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%1 = mul i32 %a, %b			%1 = mul i32 %a, %b
	%2 = add i32 %1, 4			%2 = add i32 %1, 4
	ret i32 %2			ret i32 %2
	}			}

	define i32 @mul_sub_imm1(i32 %a, i32 %b) {			define i32 @mul_sub_imm1(i32 %a, i32 %b) {
	; CHECK-LABEL: mul_sub_imm1:			; CHECK-LABEL: mul_sub_imm1:
	▲ Show 20 Lines • Show All 185 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/madd-lohi.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s			; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
	; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s			; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s

	define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {			define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
	; CHECK-LABEL: test_128bitmul:			; CHECK-LABEL: test_128bitmul:
	; CHECK: ; %bb.0:			; CHECK: ; %bb.0:
	; CHECK-NEXT: umulh x8, x0, x2			; CHECK-NEXT: umulh x8, x0, x2
				; CHECK-NEXT: mul x9, x1, x2
	; CHECK-NEXT: madd x8, x0, x3, x8			; CHECK-NEXT: madd x8, x0, x3, x8
	; CHECK-NEXT: mul x0, x0, x2			; CHECK-NEXT: mul x0, x0, x2
	; CHECK-NEXT: madd x1, x1, x2, x8			; CHECK-NEXT: add x1, x8, x9
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	;			;
	; CHECK-BE-LABEL: test_128bitmul:			; CHECK-BE-LABEL: test_128bitmul:
	; CHECK-BE: // %bb.0:			; CHECK-BE: // %bb.0:
	; CHECK-BE-NEXT: umulh x8, x1, x3			; CHECK-BE-NEXT: umulh x8, x1, x3
				; CHECK-BE-NEXT: mul x9, x0, x3
	; CHECK-BE-NEXT: madd x8, x1, x2, x8			; CHECK-BE-NEXT: madd x8, x1, x2, x8
	; CHECK-BE-NEXT: mul x1, x1, x3			; CHECK-BE-NEXT: mul x1, x1, x3
	; CHECK-BE-NEXT: madd x0, x0, x3, x8			; CHECK-BE-NEXT: add x0, x8, x9
	; CHECK-BE-NEXT: ret			; CHECK-BE-NEXT: ret


	%prod = mul i128 %lhs, %rhs			%prod = mul i128 %lhs, %rhs
	ret i128 %prod			ret i128 %prod
	}			}

llvm/test/CodeGen/AArch64/mul-lohi.ll

	; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s			; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s
	; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - | FileCheck --check-prefix=CHECK-BE %s			; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - | FileCheck --check-prefix=CHECK-BE %s

	define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {			define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
	; CHECK-LABEL: test_128bitmul:			; CHECK-LABEL: test_128bitmul:
				; CHECK: mul [[TEMP0:x[0-9]+]], x0, x3
	; CHECK: umulh [[HI:x[0-9]+]], x0, x2			; CHECK: umulh [[HI:x[0-9]+]], x0, x2
	; CHECK: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]			; CHECK: add [[TEMP1:x[0-9]+]], [[HI]], [[TEMP0]]
	; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]			; CHECK: mul [[TEMP2:x[0-9]+]], x1, x2
				; CHECK-DAG: add x1, [[TEMP1]], [[TEMP2]]
	; CHECK-DAG: mul x0, x0, x2			; CHECK-DAG: mul x0, x0, x2
	; CHECK-NEXT: ret			; CHECK-NEXT: ret

	; CHECK-BE-LABEL: test_128bitmul:			; CHECK-BE-LABEL: test_128bitmul:
				; CHECK-BE: mul [[TEMP0:x[0-9]+]], x1, x2
	; CHECK-BE: umulh [[HI:x[0-9]+]], x1, x3			; CHECK-BE: umulh [[HI:x[0-9]+]], x1, x3
	; CHECK-BE: madd [[TEMP1:x[0-9]+]], x1, x2, [[HI]]			; CHECK-BE: add [[TEMP1:x[0-9]+]], [[HI]], [[TEMP0]]
	; CHECK-BE-DAG: madd x0, x0, x3, [[TEMP1]]			; CHECK-BE: mul [[TEMP2:x[0-9]+]], x0, x3
				; CHECK-BE-DAG: add x0, [[TEMP1]], [[TEMP2]]
	; CHECK-BE-DAG: mul x1, x1, x3			; CHECK-BE-DAG: mul x1, x1, x3
	; CHECK-BE-NEXT: ret			; CHECK-BE-NEXT: ret

	%prod = mul i128 %lhs, %rhs			%prod = mul i128 %lhs, %rhs
	ret i128 %prod			ret i128 %prod
	}			}

	; The machine combiner should create madd instructions when			; The machine combiner should create madd instructions when
	Show All 26 Lines

llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll

	Show First 20 Lines • Show All 348 Lines • ▼ Show 20 Lines
	; One INT_MIN divisor in odd divisor			; One INT_MIN divisor in odd divisor
	define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {			define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind {
	; CHECK-LABEL: test_srem_odd_INT_MIN:			; CHECK-LABEL: test_srem_odd_INT_MIN:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI13_0			; CHECK-NEXT: adrp x8, .LCPI13_0
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]			; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0]
	; CHECK-NEXT: adrp x8, .LCPI13_1			; CHECK-NEXT: adrp x8, .LCPI13_1
	; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s			; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
				; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_1]
	; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s			; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
	; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
	; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1]
	; CHECK-NEXT: adrp x8, .LCPI13_2			; CHECK-NEXT: adrp x8, .LCPI13_2
	; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s			; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
	; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_2]			; CHECK-NEXT: mul v2.4s, v0.4s, v3.4s
				; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2]
	; CHECK-NEXT: adrp x8, .LCPI13_3			; CHECK-NEXT: adrp x8, .LCPI13_3
	; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s			; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
				; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s
	; CHECK-NEXT: usra v2.4s, v1.4s, #31			; CHECK-NEXT: usra v2.4s, v1.4s, #31
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_3]			; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_3]
	; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s			; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
	; CHECK-NEXT: movi v1.4s, #1			; CHECK-NEXT: movi v1.4s, #1
	; CHECK-NEXT: cmeq v0.4s, v0.4s, #0			; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
	; CHECK-NEXT: and v0.16b, v0.16b, v1.16b			; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%srem = srem <4 x i32> %X, <i32 5, i32 5, i32 2147483648, i32 5>			%srem = srem <4 x i32> %X, <i32 5, i32 5, i32 2147483648, i32 5>
	%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>			%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
	%ret = zext <4 x i1> %cmp to <4 x i32>			%ret = zext <4 x i1> %cmp to <4 x i32>
	ret <4 x i32> %ret			ret <4 x i32> %ret
	}			}

	; One INT_MIN divisor in even divisor			; One INT_MIN divisor in even divisor
	define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {			define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
	; CHECK-LABEL: test_srem_even_INT_MIN:			; CHECK-LABEL: test_srem_even_INT_MIN:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI14_0			; CHECK-NEXT: adrp x8, .LCPI14_0
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]			; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
	; CHECK-NEXT: adrp x8, .LCPI14_1			; CHECK-NEXT: adrp x8, .LCPI14_1
	; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s			; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
				; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_1]
	; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s			; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
	; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
	; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1]
	; CHECK-NEXT: adrp x8, .LCPI14_2			; CHECK-NEXT: adrp x8, .LCPI14_2
	; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s			; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
	; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_2]			; CHECK-NEXT: mul v2.4s, v0.4s, v3.4s
				; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2]
	; CHECK-NEXT: adrp x8, .LCPI14_3			; CHECK-NEXT: adrp x8, .LCPI14_3
	; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s			; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
				; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s
	; CHECK-NEXT: usra v2.4s, v1.4s, #31			; CHECK-NEXT: usra v2.4s, v1.4s, #31
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3]			; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3]
	; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s			; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
	; CHECK-NEXT: movi v1.4s, #1			; CHECK-NEXT: movi v1.4s, #1
	; CHECK-NEXT: cmeq v0.4s, v0.4s, #0			; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
	; CHECK-NEXT: and v0.16b, v0.16b, v1.16b			; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%srem = srem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14>			%srem = srem <4 x i32> %X, <i32 14, i32 14, i32 2147483648, i32 14>
	%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>			%cmp = icmp eq <4 x i32> %srem, <i32 0, i32 0, i32 0, i32 0>
	%ret = zext <4 x i1> %cmp to <4 x i32>			%ret = zext <4 x i1> %cmp to <4 x i32>
	ret <4 x i32> %ret			ret <4 x i32> %ret
	}			}

	; One INT_MIN divisor in odd+even divisor			; One INT_MIN divisor in odd+even divisor
	define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {			define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind {
	; CHECK-LABEL: test_srem_odd_even_INT_MIN:			; CHECK-LABEL: test_srem_odd_even_INT_MIN:
	; CHECK: // %bb.0:			; CHECK: // %bb.0:
	; CHECK-NEXT: adrp x8, .LCPI15_0			; CHECK-NEXT: adrp x8, .LCPI15_0
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]			; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
	; CHECK-NEXT: adrp x8, .LCPI15_1			; CHECK-NEXT: adrp x8, .LCPI15_1
	; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s			; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
				; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_1]
	; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s			; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
	; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
	; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1]
	; CHECK-NEXT: adrp x8, .LCPI15_2			; CHECK-NEXT: adrp x8, .LCPI15_2
	; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s			; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
	; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_2]			; CHECK-NEXT: mul v2.4s, v0.4s, v3.4s
				; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2]
	; CHECK-NEXT: adrp x8, .LCPI15_3			; CHECK-NEXT: adrp x8, .LCPI15_3
	; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s			; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
				; CHECK-NEXT: sshl v2.4s, v1.4s, v3.4s
	; CHECK-NEXT: usra v2.4s, v1.4s, #31			; CHECK-NEXT: usra v2.4s, v1.4s, #31
	; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3]			; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3]
	; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s			; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
	; CHECK-NEXT: movi v1.4s, #1			; CHECK-NEXT: movi v1.4s, #1
	; CHECK-NEXT: cmeq v0.4s, v0.4s, #0			; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
	; CHECK-NEXT: and v0.16b, v0.16b, v1.16b			; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%srem = srem <4 x i32> %X, <i32 5, i32 14, i32 2147483648, i32 100>			%srem = srem <4 x i32> %X, <i32 5, i32 14, i32 2147483648, i32 100>
	▲ Show 20 Lines • Show All 311 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[MachineCombiner, AArch64] Add a new pattern A-(B+C) => (A-B)-C to reduce latency
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 425626

llvm/include/llvm/CodeGen/MachineCombinerPattern.h

llvm/lib/CodeGen/MachineCombiner.cpp

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir

llvm/test/CodeGen/AArch64/addsub_ext.ll

llvm/test/CodeGen/AArch64/arm64-fma-combines.ll

llvm/test/CodeGen/AArch64/fadd-combines.ll

llvm/test/CodeGen/AArch64/i128-math.ll

llvm/test/CodeGen/AArch64/machine-combiner-madd.ll

llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll

llvm/test/CodeGen/AArch64/madd-combiner.ll

llvm/test/CodeGen/AArch64/madd-lohi.ll

llvm/test/CodeGen/AArch64/mul-lohi.ll

llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll

This is an archive of the discontinued LLVM Phabricator instance.

[MachineCombiner, AArch64] Add a new pattern A-(B+C) => (A-B)-C to reduce latency
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 425626

llvm/include/llvm/CodeGen/MachineCombinerPattern.h

llvm/lib/CodeGen/MachineCombiner.cpp

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

llvm/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir

llvm/test/CodeGen/AArch64/addsub_ext.ll

llvm/test/CodeGen/AArch64/arm64-fma-combines.ll

llvm/test/CodeGen/AArch64/fadd-combines.ll

llvm/test/CodeGen/AArch64/i128-math.ll

llvm/test/CodeGen/AArch64/machine-combiner-madd.ll

llvm/test/CodeGen/AArch64/machine-combiner-subadd.ll

llvm/test/CodeGen/AArch64/madd-combiner.ll

llvm/test/CodeGen/AArch64/madd-lohi.ll

llvm/test/CodeGen/AArch64/mul-lohi.ll

llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll

[MachineCombiner, AArch64] Add a new pattern A-(B+C) => (A-B)-C to reduce latency
ClosedPublic