Diff 343657

llvm/lib/Target/ARM/ARMISelLowering.h

Show First 20 Lines • Show All 45 Lines • ▼ Show 20 Lines
class SelectionDAG;		class SelectionDAG;
class TargetLibraryInfo;		class TargetLibraryInfo;
class TargetMachine;		class TargetMachine;
class TargetRegisterInfo;		class TargetRegisterInfo;
class VectorType;		class VectorType;

namespace ARMISD {		namespace ARMISD {

// ARM Specific DAG Nodes		// ARM Specific DAG Nodes
		dmgreenUnsubmitted Done Reply Inline Actions .. for tail predicated loops. dmgreen: .. for tail predicated loops.
enum NodeType : unsigned {		enum NodeType : unsigned {
// Start the numbering where the builtin ops and target ops leave off.		// Start the numbering where the builtin ops and target ops leave off.
FIRST_NUMBER = ISD::BUILTIN_OP_END,		FIRST_NUMBER = ISD::BUILTIN_OP_END,

Wrapper, // Wrapper - A wrapper node for TargetConstantPool,		Wrapper, // Wrapper - A wrapper node for TargetConstantPool,
// TargetExternalSymbol, and TargetGlobalAddress.		// TargetExternalSymbol, and TargetGlobalAddress.
WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in		WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in
// PIC mode.		// PIC mode.
▲ Show 20 Lines • Show All 235 Lines • ▼ Show 20 Lines	enum NodeType : unsigned {

// Pseudo-instruction representing a memory copy using ldm/stm		// Pseudo-instruction representing a memory copy using ldm/stm
// instructions.		// instructions.
MEMCPY,		MEMCPY,

// Pseudo-instruction representing a memory copy using a tail predicated		// Pseudo-instruction representing a memory copy using a tail predicated
// loop		// loop
MEMCPYLOOP,		MEMCPYLOOP,
		// Pseudo-instruction representing a memset using a tail predicated
		// loop
		MEMSETLOOP,

// V8.1MMainline condition select		// V8.1MMainline condition select
CSINV, // Conditional select invert.		CSINV, // Conditional select invert.
CSNEG, // Conditional select negate.		CSNEG, // Conditional select negate.
CSINC, // Conditional select increment.		CSINC, // Conditional select increment.

// Vector load N-element structure to all lanes:		// Vector load N-element structure to all lanes:
VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,		VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
▲ Show 20 Lines • Show All 651 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,797 Lines • ▼ Show 20 Lines	case ARMISD::FIRST_NUMBER:
MAKE_CASE(ARMISD::WLS)		MAKE_CASE(ARMISD::WLS)
MAKE_CASE(ARMISD::WLSSETUP)		MAKE_CASE(ARMISD::WLSSETUP)
MAKE_CASE(ARMISD::LE)		MAKE_CASE(ARMISD::LE)
MAKE_CASE(ARMISD::LOOP_DEC)		MAKE_CASE(ARMISD::LOOP_DEC)
MAKE_CASE(ARMISD::CSINV)		MAKE_CASE(ARMISD::CSINV)
MAKE_CASE(ARMISD::CSNEG)		MAKE_CASE(ARMISD::CSNEG)
MAKE_CASE(ARMISD::CSINC)		MAKE_CASE(ARMISD::CSINC)
MAKE_CASE(ARMISD::MEMCPYLOOP)		MAKE_CASE(ARMISD::MEMCPYLOOP)
		MAKE_CASE(ARMISD::MEMSETLOOP)
#undef MAKE_CASE		#undef MAKE_CASE
}		}
return nullptr;		return nullptr;
}		}

EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,		EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const {		EVT VT) const {
if (!VT.isVector())		if (!VT.isVector())
▲ Show 20 Lines • Show All 9,286 Lines • ▼ Show 20 Lines

/// Adds logic in loop entry MBB to calculate loop iteration count and adds		/// Adds logic in loop entry MBB to calculate loop iteration count and adds
/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop		/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
static Register genTPEntry(MachineBasicBlock *TpEntry,		static Register genTPEntry(MachineBasicBlock *TpEntry,
MachineBasicBlock *TpLoopBody,		MachineBasicBlock *TpLoopBody,
MachineBasicBlock *TpExit, Register OpSizeReg,		MachineBasicBlock *TpExit, Register OpSizeReg,
const TargetInstrInfo *TII, DebugLoc Dl,		const TargetInstrInfo *TII, DebugLoc Dl,
MachineRegisterInfo &MRI) {		MachineRegisterInfo &MRI) {

// Calculates loop iteration count = ceil(n/16) = ((n + 15)&(-16)) / 16.		// Calculates loop iteration count = ceil(n/16) = ((n + 15)&(-16)) / 16.
Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);		Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)		BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
.addUse(OpSizeReg)		.addUse(OpSizeReg)
.addImm(15)		.addImm(15)
.add(predOps(ARMCC::AL))		.add(predOps(ARMCC::AL))
.addReg(0);		.addReg(0);
		dmgreenUnsubmitted Done Reply Inline Actions It might be worth creating a dup in EmitTargetCodeForMemset and having this use the vector value it produced. dmgreen: It might be worth creating a dup in EmitTargetCodeForMemset and having this use the vector…

Register BicDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);		Register BicDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
BuildMI(TpEntry, Dl, TII->get(ARM::t2BICri), BicDestReg)		BuildMI(TpEntry, Dl, TII->get(ARM::t2BICri), BicDestReg)
.addUse(AddDestReg, RegState::Kill)		.addUse(AddDestReg, RegState::Kill)
.addImm(16)		.addImm(16)
.add(predOps(ARMCC::AL))		.add(predOps(ARMCC::AL))
.addReg(0);		.addReg(0);

Show All 18 Lines
/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and		/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
/// t2DoLoopEnd. These are used by later passes to generate tail predicated		/// t2DoLoopEnd. These are used by later passes to generate tail predicated
/// loops.		/// loops.
static void genTPLoopBody(MachineBasicBlock *TpLoopBody,		static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
MachineBasicBlock TpEntry, MachineBasicBlock TpExit,		MachineBasicBlock TpEntry, MachineBasicBlock TpExit,
const TargetInstrInfo *TII, DebugLoc Dl,		const TargetInstrInfo *TII, DebugLoc Dl,
MachineRegisterInfo &MRI, Register OpSrcReg,		MachineRegisterInfo &MRI, Register OpSrcReg,
Register OpDestReg, Register ElementCountReg,		Register OpDestReg, Register ElementCountReg,
Register TotalIterationsReg) {		Register TotalIterationsReg, bool IsMemcpy) {
		// First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
// First insert 4 PHI nodes for: Current pointer to Src, Dest array, loop		// array, loop iteration counter, predication counter.
// iteration counter, predication counter Current position in the src array
Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);		Register SrcPhiReg, CurrSrcReg;
Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);		if (IsMemcpy) {
		// Current position in the src array
		SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
		CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)		BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
.addUse(OpSrcReg)		.addUse(OpSrcReg)
.addMBB(TpEntry)		.addMBB(TpEntry)
.addUse(CurrSrcReg)		.addUse(CurrSrcReg)
.addMBB(TpLoopBody);		.addMBB(TpLoopBody);
		}

// Current position in the dest array		// Current position in the dest array
Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);		Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);		Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)		BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
.addUse(OpDestReg)		.addUse(OpDestReg)
.addMBB(TpEntry)		.addMBB(TpEntry)
.addUse(CurrDestReg)		.addUse(CurrDestReg)
Show All 26 Lines	BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
.addReg(0);		.addReg(0);

BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)		BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
.addUse(PredCounterPhiReg)		.addUse(PredCounterPhiReg)
.addImm(16)		.addImm(16)
.add(predOps(ARMCC::AL))		.add(predOps(ARMCC::AL))
.addReg(0);		.addReg(0);

// VLDRB and VSTRB instructions, predicated using VPR		// VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);		Register SrcValueReg;
		if (IsMemcpy) {
		SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))		BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
.addDef(CurrSrcReg)		.addDef(CurrSrcReg)
.addDef(LoadedValueReg)		.addDef(SrcValueReg)
.addReg(SrcPhiReg)		.addReg(SrcPhiReg)
.addImm(16)		.addImm(16)
.addImm(ARMVCC::Then)		.addImm(ARMVCC::Then)
.addUse(VccrReg);		.addUse(VccrReg);
		} else
		SrcValueReg = OpSrcReg;

BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))		BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
.addDef(CurrDestReg)		.addDef(CurrDestReg)
.addUse(LoadedValueReg, RegState::Kill)		.addUse(SrcValueReg)
.addReg(DestPhiReg)		.addReg(DestPhiReg)
.addImm(16)		.addImm(16)
.addImm(ARMVCC::Then)		.addImm(ARMVCC::Then)
.addUse(VccrReg);		.addUse(VccrReg);

// Add the pseudoInstrs for decrementing the loop counter and marking the		// Add the pseudoInstrs for decrementing the loop counter and marking the
// end:t2DoLoopDec and t2DoLoopEnd		// end:t2DoLoopDec and t2DoLoopEnd
BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)		BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
.addUse(LoopCounterPhiReg)		.addUse(LoopCounterPhiReg)
.addImm(1);		.addImm(1);

BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))		BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
.addUse(RemainingLoopIterationsReg)		.addUse(RemainingLoopIterationsReg)
.addMBB(TpLoopBody);		.addMBB(TpLoopBody);

BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))		BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
.addMBB(TpExit)		.addMBB(TpExit)
.add(predOps(ARMCC::AL));		.add(predOps(ARMCC::AL));
}		}

MachineBasicBlock *		MachineBasicBlock *
ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,		ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {		MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();		const TargetInstrInfo *TII = Subtarget->getInstrInfo();
		dmgreenUnsubmitted Done Reply Inline Actions This and genTPLoopBodyMemcpy are very similar. Is it possible to combine them more? dmgreen: This and genTPLoopBodyMemcpy are very similar. Is it possible to combine them more?
DebugLoc dl = MI.getDebugLoc();		DebugLoc dl = MI.getDebugLoc();
		dmgreenUnsubmitted Done Reply Inline Actions Formatting. dmgreen: Formatting.
bool isThumb2 = Subtarget->isThumb2();		bool isThumb2 = Subtarget->isThumb2();
switch (MI.getOpcode()) {		switch (MI.getOpcode()) {
default: {		default: {
MI.print(errs());		MI.print(errs());
llvm_unreachable("Unexpected instr type to insert");		llvm_unreachable("Unexpected instr type to insert");
}		}

// Thumb1 post-indexed loads are really just single-register LDMs.		// Thumb1 post-indexed loads are really just single-register LDMs.
case ARM::tLDR_postidx: {		case ARM::tLDR_postidx: {
MachineOperand Def(MI.getOperand(1));		MachineOperand Def(MI.getOperand(1));
BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))		BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
.add(Def) // Rn_wb		.add(Def) // Rn_wb
.add(MI.getOperand(2)) // Rn		.add(MI.getOperand(2)) // Rn
.add(MI.getOperand(3)) // PredImm		.add(MI.getOperand(3)) // PredImm
.add(MI.getOperand(4)) // PredReg		.add(MI.getOperand(4)) // PredReg
.add(MI.getOperand(0)) // Rt		.add(MI.getOperand(0)) // Rt
.cloneMemRefs(MI);		.cloneMemRefs(MI);
MI.eraseFromParent();		MI.eraseFromParent();
return BB;		return BB;
}		}

case ARM::MVE_MEMCPYLOOPINST: {		case ARM::MVE_MEMCPYLOOPINST:
		case ARM::MVE_MEMSETLOOPINST: {

// Transformation below expands MVE_MEMCPYLOOPINST Pseudo instruction		// Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
// into a Tail Predicated (TP) Loop. It adds the instructions to calculate		// into a Tail Predicated (TP) Loop. It adds the instructions to calculate
// the iteration count (= ceil(size_in_bytes/16)) in the TP entry block and		// the iteration count (= ceil(size_in_bytes/16)) in the TP entry block and
// adds the relevant instructions in the TP loop Body for generation of a		// adds the relevant instructions in the TP loop Body for generation of a
// WLSTP loop.		// WLSTP loop.

// Below is relevant portion of the CFG after the transformation.		// Below is relevant portion of the CFG after the transformation.
// The Machine Basic Blocks are shown along with branch conditions (in		// The Machine Basic Blocks are shown along with branch conditions (in
// brackets). Note that TP entry/exit MBBs depict the entry/exit of this		// brackets). Note that TP entry/exit MBBs depict the entry/exit of this
Show All 23 Lines	case ARM::MVE_MEMSETLOOPINST: {
// Allocate the required MBBs and add to parent function.		// Allocate the required MBBs and add to parent function.
MachineBasicBlock *TpEntry = BB;		MachineBasicBlock *TpEntry = BB;
MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();		MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
MachineBasicBlock *TpExit;		MachineBasicBlock *TpExit;

MF->push_back(TpLoopBody);		MF->push_back(TpLoopBody);

// If any instructions are present in the current block after		// If any instructions are present in the current block after
// MVE_MEMCPYLOOPINST, split the current block and move the instructions		// MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
// into the newly created exit block. If there are no instructions		// move the instructions into the newly created exit block. If there are no
// add an explicit branch to the FallThrough block and then split.		// instructions add an explicit branch to the FallThrough block and then
		// split.
//		//
// The split is required for two reasons:		// The split is required for two reasons:
// 1) A terminator(t2WhileLoopStart) will be placed at that site.		// 1) A terminator(t2WhileLoopStart) will be placed at that site.
// 2) Since a TPLoopBody will be added later, any phis in successive blocks		// 2) Since a TPLoopBody will be added later, any phis in successive blocks
// need to be updated. splitAt() already handles this.		// need to be updated. splitAt() already handles this.
TpExit = BB->splitAt(MI, false);		TpExit = BB->splitAt(MI);
if (TpExit == BB) {		if (TpExit == BB) {
assert(BB->canFallThrough() &&		assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
"Exit block must be FallThrough of the block containing memcpy");		"block containing memcpy/memset Pseudo");
TpExit = BB->getFallThrough();		TpExit = BB->getFallThrough();
		dmgreenUnsubmitted Not Done Reply Inline Actions Why has this changed from false to true? What happens if CPSR is live across the MVE_MEMCPYLOOPINST? WhileLoopStart and LoopEnd both clobber that physical register, in case they get reverted to subs; bne. Do we need to add the same clobber to these new MVE mem loop instructions? dmgreen: Why has this changed from false to true? What happens if CPSR is live across the…
		malharJAuthorUnsubmitted Done Reply Inline Actions I'm a bit unclear on this, so I've still left this as UpdateLiveIns=true in the latest patchset. Inside splitAt(), I can see this relevant piece of code where: LiveRegs.addLiveOuts() will add the live outs of the original block. The for-loop will add the liveins for any physical register uses by the instructions in the newly split block. LivePhysRegs LiveRegs; if (UpdateLiveIns) { ... LiveRegs.addLiveOuts(this); for (auto I = rbegin(), E = Prev.getReverse(); I != E; ++I) LiveRegs.stepBackward(I); } ... if (UpdateLiveIns) addLiveIns(SplitBB, LiveRegs); The condition uses I != E (where E is the memcpy/set pseudo) ... so I suppose any liveness information from it is not added? And in case it gets reverted and clobbers CPSR in a later pass, shouldn't that take care of updating the liveness information of successive blocks? malharJ:* I'm a bit unclear on this, so I've still left this as UpdateLiveIns=true in the latest…
BuildMI(BB, dl, TII->get(ARM::t2B))		BuildMI(BB, dl, TII->get(ARM::t2B))
.addMBB(TpExit)		.addMBB(TpExit)
.add(predOps(ARMCC::AL));		.add(predOps(ARMCC::AL));
TpExit = BB->splitAt(MI, false);		TpExit = BB->splitAt(MI);
}		}

// Add logic for iteration count		// Add logic for iteration count
Register TotalIterationsReg =		Register TotalIterationsReg =
genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);		genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);

// Add the vectorized (and predicated) loads/store instructions		// Add the vectorized (and predicated) loads/store instructions
		bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,		genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
OpDestReg, OpSizeReg, TotalIterationsReg);		OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);

// Required to avoid conflict with the MachineVerifier during testing.		// Required to avoid conflict with the MachineVerifier during testing.
Properties.reset(MachineFunctionProperties::Property::NoPHIs);		Properties.reset(MachineFunctionProperties::Property::NoPHIs);

// Connect the blocks		// Connect the blocks
TpEntry->addSuccessor(TpLoopBody);		TpEntry->addSuccessor(TpLoopBody);
TpLoopBody->addSuccessor(TpLoopBody);		TpLoopBody->addSuccessor(TpLoopBody);
TpLoopBody->addSuccessor(TpExit);		TpLoopBody->addSuccessor(TpExit);
▲ Show 20 Lines • Show All 8,710 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMInstrMVE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 6,871 Lines • ▼ Show 20 Lines

	let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {			let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
	def MVE_MEMCPYLOOPINST : PseudoInst<(outs),			def MVE_MEMCPYLOOPINST : PseudoInst<(outs),
	(ins rGPR:$dst, rGPR:$src, rGPR:$sz),			(ins rGPR:$dst, rGPR:$src, rGPR:$sz),
	NoItinerary,			NoItinerary,
	[(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;			[(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;
	}			}

				def SDT_MVEMEMSETLOOPNODE
				: SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisVT<1, v16i8>, SDTCisVT<2, i32>]>;
				def MVE_MEMSETLOOPNODE : SDNode<"ARMISD::MEMSETLOOP", SDT_MVEMEMSETLOOPNODE,
				[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;

				let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
				def MVE_MEMSETLOOPINST : PseudoInst<(outs),
				(ins rGPR:$dst, MQPR:$src, rGPR:$sz),
				NoItinerary,
				[(MVE_MEMSETLOOPNODE rGPR:$dst, MQPR:$src, rGPR:$sz)]>;
				}

	def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;			def MVE_DLSTP_8 : MVE_DLSTP<"dlstp.8", 0b00>;
	def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;			def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
	def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;			def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;
	def MVE_DLSTP_64 : MVE_DLSTP<"dlstp.64", 0b11>;			def MVE_DLSTP_64 : MVE_DLSTP<"dlstp.64", 0b11>;

	def MVE_WLSTP_8 : MVE_WLSTP<"wlstp.8", 0b00>;			def MVE_WLSTP_8 : MVE_WLSTP<"wlstp.8", 0b00>;
	def MVE_WLSTP_16 : MVE_WLSTP<"wlstp.16", 0b01>;			def MVE_WLSTP_16 : MVE_WLSTP<"wlstp.16", 0b01>;
	def MVE_WLSTP_32 : MVE_WLSTP<"wlstp.32", 0b10>;			def MVE_WLSTP_32 : MVE_WLSTP<"wlstp.32", 0b10>;
	▲ Show 20 Lines • Show All 547 Lines • Show Last 20 Lines

llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp

Show All 14 Lines
#include "llvm/CodeGen/SelectionDAG.h"		#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "arm-selectiondag-info"		#define DEBUG_TYPE "arm-selectiondag-info"

cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(		cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
"arm-memtransfer-tploop", cl::Hidden,		"arm-memtransfer-tploop", cl::Hidden,
		dmgreenUnsubmitted Done Reply Inline Actions One option for both is probably fine. dmgreen: One option for both is probably fine.
cl::desc("Control conversion of memcpy to "		cl::desc("Control conversion of memcpy to "
"Tail predicated loops (WLSTP)"),		"Tail predicated loops (WLSTP)"),
cl::init(TPLoop::ForceDisabled),		cl::init(TPLoop::ForceDisabled),
cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",		cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
"Don't convert memcpy to TP loop."),		"Don't convert memcpy to TP loop."),
clEnumValN(TPLoop::ForceEnabled, "force-enabled",		clEnumValN(TPLoop::ForceEnabled, "force-enabled",
"Always convert memcpy to TP loop."),		"Always convert memcpy to TP loop."),
clEnumValN(TPLoop::Allow, "allow",		clEnumValN(TPLoop::Allow, "allow",
▲ Show 20 Lines • Show All 102 Lines • ▼ Show 20 Lines	CLI.setDebugLoc(dl)
TLI->getPointerTy(DAG.getDataLayout())),		TLI->getPointerTy(DAG.getDataLayout())),
std::move(Args))		std::move(Args))
.setDiscardResult();		.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);		std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);

return CallResult.second;		return CallResult.second;
}		}

SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(		static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
		dmgreenUnsubmitted Done Reply Inline Actions Maybe shouldGenerateInlineTPLoop would be a more descriptive name? dmgreen: Maybe shouldGenerateInlineTPLoop would be a more descriptive name?
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,		const SelectionDAG &DAG,
SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,		ConstantSDNode *ConstantSize,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {		Align Alignment, bool IsMemcpy) {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

auto GenInlineTP = [&](const ARMSubtarget &Subtarget,
const SelectionDAG &DAG) {
auto &F = DAG.getMachineFunction().getFunction();		auto &F = DAG.getMachineFunction().getFunction();
if (!EnableMemtransferTPLoop)		if (!EnableMemtransferTPLoop)
return false;		return false;
if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)		if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
return true;		return true;
// Do not generate inline TP loop if optimizations are disabled,		// Do not generate inline TP loop if optimizations are disabled,
// or if optimization for size (-Os or -Oz) is on.		// or if optimization for size (-Os or -Oz) is on.
if (F.hasOptNone() \|\| F.hasOptSize())		if (F.hasOptNone() \|\| F.hasOptSize())
return false;		return false;
		dmgreenUnsubmitted Done Reply Inline Actions for -> For, probably with a full stop on the previous line. dmgreen: for -> For, probably with a full stop on the previous line.
// If cli option is unset		// If cli option is unset, for memset always generate inline TP.
		// For memcpy, check some conditions
		if (!IsMemcpy)
		return true;
if (!ConstantSize && Alignment >= Align(4))		if (!ConstantSize && Alignment >= Align(4))
		dmgreenUnsubmitted Done Reply Inline Actions Make sure you clang-format the patch. dmgreen: Make sure you clang-format the patch.
return true;		return true;
if (ConstantSize &&		if (ConstantSize &&
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&		ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
ConstantSize->getZExtValue() <		ConstantSize->getZExtValue() <
Subtarget.getMaxTPLoopInlineSizeThreshold())		Subtarget.getMaxMemcpyTPInlineSizeThreshold())
return true;		return true;
return false;		return false;
};		}

		SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
		SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
		SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
		MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
		const ARMSubtarget &Subtarget =
		DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
		ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

if (Subtarget.hasMVEIntegerOps() && GenInlineTP(Subtarget, DAG))		if (Subtarget.hasMVEIntegerOps() &&
		shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,		return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
DAG.getZExtOrTrunc(Size, dl, MVT::i32));		DAG.getZExtOrTrunc(Size, dl, MVT::i32));

// Do repeated 4-byte loads and stores. To be improved.		// Do repeated 4-byte loads and stores. To be improved.
// This requires 4-byte alignment.		// This requires 4-byte alignment.
if (Alignment < Align(4))		if (Alignment < Align(4))
return SDValue();		return SDValue();
// This requires the copy size to be a constant, preferably		// This requires the copy size to be a constant, preferably
▲ Show 20 Lines • Show All 106 Lines • ▼ Show 20 Lines	SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,		return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
Alignment.value(), RTLIB::MEMMOVE);		Alignment.value(), RTLIB::MEMMOVE);
}		}

SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(		SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,		SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size, Align Alignment, bool isVolatile,		SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {		MachinePointerInfo DstPtrInfo) const {

		const ARMSubtarget &Subtarget =
		DAG.getMachineFunction().getSubtarget<ARMSubtarget>();

		ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

		// Generate TP loop for llvm.memset
		if (Subtarget.hasMVEIntegerOps() &&
		dmgreenUnsubmitted Done Reply Inline Actions OptSize? dmgreen: OptSize?
		shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
		false)) {
		dmgreenUnsubmitted Done Reply Inline Actions It's best to create a shuffle vector or build vector, not an ARMISD::VDUP directly. That may optimize better in places. Is the input always an i8? dmgreen: It's best to create a shuffle vector or build vector, not an ARMISD::VDUP directly. That may…
		malharJAuthorUnsubmitted Done Reply Inline Actions I've made this update with a build vector. Have 2 minor queries: Why would shuffle vector be appropriate here (given that all we want to create is a vector of constants) ? Even though I'm not utilising vdup directly, just to understand better, why would the input need to be i8 to generate a v16i8 vector ... can the vector not be generated by a i32 source register ? malharJ: I've made this update with a build vector. Have 2 minor queries: - Why would shuffle vector be…
		dmgreenUnsubmitted Not Done Reply Inline Actions Do you mean "Why use a shuffle as opposed to an ARMISD::VDUP"? Normal codegen will usually start off by producing a shuffle, which will then be optimized and eventually turned into a VDUP. Because it works that way around there are not a lot of optimizations on VDUP directly, as can be seen with the constant becoming a VMOVimm. We could theoretically add them, but it's added complexity and it's easier to just use the optimizations that are already present by starting from a shuffle. The input should be able to be an i32, but the types would need to be correct. Do we know the type of the value is always an i8? dmgreen: Do you mean "Why use a shuffle as opposed to an ARMISD::VDUP"? Normal codegen will usually start…
		malharJAuthorUnsubmitted Done Reply Inline Actions Ok. Thanks for clarifying about the shuffle vector. Do we know the type of the value is always an i8? I have a truncate operation on line 298 which ensures the input is i8. malharJ: Ok. Thanks for clarifying about the shuffle vector. > Do we know the type of the value is…
		dmgreenUnsubmitted Not Done Reply Inline Actions Yep, but if the value isn't an i8 it will discard some bits it should not. Something like a `@llvm.memset.p0i8.i32` or `@llvm.memset.p0i32.i32`, if they are valid. Is it possible to add an assert at least? dmgreen: Yep, but if the value isn't an i8 it will discard some bits it should not. Something like a…
		malharJAuthorUnsubmitted Done Reply Inline Actions So having a look at the language ref for llvm.memset suggests that the Src value is always an i8 (It may get zero extended before reaching here but that's not a problem) .. malharJ: So having a look at the language ref for [[ https://llvm.org/docs/LangRef.html#llvm-memset…
		Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
		dmgreenUnsubmitted Done Reply Inline Actions Does it need Src.getValue(0)? dmgreen: Does it need Src.getValue(0)?
		DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
		return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
		DAG.getZExtOrTrunc(Size, dl, MVT::i32));
		}

return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,		return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
Alignment.value(), RTLIB::MEMSET);		Alignment.value(), RTLIB::MEMSET);
}		}

llvm/lib/Target/ARM/ARMSubtarget.h

Show First 20 Lines • Show All 532 Lines • ▼ Show 20 Lines	ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
bool MinSize = false);		bool MinSize = false);

/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size		/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.		/// that still makes it profitable to inline the call.
unsigned getMaxInlineSizeThreshold() const {		unsigned getMaxInlineSizeThreshold() const {
return 64;		return 64;
}		}

/// getMaxTPLoopSizeThreshold - Returns the maximum memcpy size		/// getMaxMemcpyTPInlineSizeThreshold - Returns the maximum size
/// that still makes it profitable to inline the call as a Tail		/// that still makes it profitable to inline a llvm.memcpy as a Tail
/// Predicated loop		/// Predicated loop.
unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; }		/// This threshold should only be used for constant size inputs.
		unsigned getMaxMemcpyTPInlineSizeThreshold() const { return 128; }

/// ParseSubtargetFeatures - Parses features string setting specified		/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.		/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);		void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

/// initializeSubtargetDependencies - Initializes using a CPU and feature string		/// initializeSubtargetDependencies - Initializes using a CPU and feature string
/// so that we can use initializer lists for subtarget initialization.		/// so that we can use initializer lists for subtarget initialization.
ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);		ARMSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
▲ Show 20 Lines • Show All 385 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll

Show First 20 Lines • Show All 52 Lines • ▼ Show 20 Lines	for.body: ; preds = %entry, %for.body
%inc = add nuw nsw i32 %i.011, 1		%inc = add nuw nsw i32 %i.011, 1
%exitcond.not = icmp eq i32 %inc, %n		%exitcond.not = icmp eq i32 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body		br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}		}

define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {		define void @test_memset(i32* nocapture %x, i32 %n, i32 %m) {
; CHECK-LABEL: test_memset:		; CHECK-LABEL: test_memset:
; CHECK: @ %bb.0: @ %entry		; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}		; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, r5, r6, r7, lr}		; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: cmp r1, #1		; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB1_3		; CHECK-NEXT: it lt
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader		; CHECK-NEXT: poplt {r4, pc}
; CHECK-NEXT: mov r4, r2		; CHECK-NEXT: .LBB1_1: @ %for.body.preheader
; CHECK-NEXT: mov r5, r1		; CHECK-NEXT: lsl.w r12, r2, #2
; CHECK-NEXT: mov r6, r0		; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: lsls r7, r2, #2		; CHECK-NEXT: b .LBB1_2
; CHECK-NEXT: .LBB1_2: @ %for.body		; CHECK-NEXT: .LBB1_2: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1		; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: mov r0, r6		; CHECK-NEXT: @ Child Loop BB1_4 Depth 2
; CHECK-NEXT: mov r1, r4		; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: bl __aeabi_memclr4		; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: add r6, r7		; CHECK-NEXT: wlstp.8 lr, r3, .LBB1_3
; CHECK-NEXT: subs r5, #1		; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: bne .LBB1_2		; CHECK-NEXT: .LBB1_3: @ %for.body
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup		; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: add sp, #4		; CHECK-NEXT: add r0, r12
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}		; CHECK-NEXT: subs r1, #1
		; CHECK-NEXT: beq .LBB1_5
		; CHECK-NEXT: b .LBB1_2
		; CHECK-NEXT: .LBB1_4: @ Parent Loop BB1_2 Depth=1
		; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
		; CHECK-NEXT: vstrb.8 q0, [r4], #16
		; CHECK-NEXT: letp lr, .LBB1_4
		; CHECK-NEXT: b .LBB1_3
		; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
		; CHECK-NEXT: pop {r4, pc}
entry:		entry:
%cmp5 = icmp sgt i32 %n, 0		%cmp5 = icmp sgt i32 %n, 0
br i1 %cmp5, label %for.body, label %for.cond.cleanup		br i1 %cmp5, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry		for.cond.cleanup: ; preds = %for.body, %entry
ret void		ret void

for.body: ; preds = %entry, %for.body		for.body: ; preds = %entry, %for.body
▲ Show 20 Lines • Show All 185 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py

	; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - \| FileCheck %s			; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow %s -o 2>/dev/null - \| FileCheck %s

	!0 = !{i32 1, !"wchar_size", i32 4}			!0 = !{i32 1, !"wchar_size", i32 4}
	!1 = !{i32 1, !"min_enum_size", i32 4}			!1 = !{i32 1, !"min_enum_size", i32 4}
	!2 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 26f04d01a39a33d73fd23165c208b215bf5c350d)"}			!2 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 26f04d01a39a33d73fd23165c208b215bf5c350d)"}
	!3 = !{!4, !4, i64 0}			!3 = !{!4, !4, i64 0}
	!4 = !{!"int", !5, i64 0}			!4 = !{!"int", !5, i64 0}
	!5 = !{!"omnipotent char", !6, i64 0}			!5 = !{!"omnipotent char", !6, i64 0}
	!6 = !{!"Simple C/C++ TBAA"}			!6 = !{!"Simple C/C++ TBAA"}
	▲ Show 20 Lines • Show All 575 Lines • ▼ Show 20 Lines

	define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i16* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {			define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i16* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {
	; CHECK-LABEL: arm_mat_mult_q15:			; CHECK-LABEL: arm_mat_mult_q15:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}			; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
	; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}			; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
	; CHECK-NEXT: .pad #4			; CHECK-NEXT: .pad #4
	; CHECK-NEXT: sub sp, #4			; CHECK-NEXT: sub sp, #4
	; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}			; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
	; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}			; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
	; CHECK-NEXT: .pad #24			; CHECK-NEXT: .pad #32
	; CHECK-NEXT: sub sp, #24			; CHECK-NEXT: sub sp, #32
	; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill			; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill
	; CHECK-NEXT: cmp r3, #0			; CHECK-NEXT: cmp r3, #0
	; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill			; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
	; CHECK-NEXT: mov r0, r3			; CHECK-NEXT: mov r0, r3
	; CHECK-NEXT: itt ne			; CHECK-NEXT: itt ne
	; CHECK-NEXT: ldrne r0, [sp, #112]			; CHECK-NEXT: ldrne r0, [sp, #136]
	; CHECK-NEXT: cmpne r0, #0			; CHECK-NEXT: cmpne r0, #0
	; CHECK-NEXT: bne .LBB10_2			; CHECK-NEXT: bne .LBB10_2
	; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup			; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
	; CHECK-NEXT: add sp, #24			; CHECK-NEXT: add sp, #32
	; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}			; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
	; CHECK-NEXT: add sp, #4			; CHECK-NEXT: add sp, #4
	; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}			; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
	; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader			; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
	; CHECK-NEXT: ldr.w r9, [sp, #116]			; CHECK-NEXT: ldr.w r12, [sp, #140]
	; CHECK-NEXT: mov r6, r1			; CHECK-NEXT: movs r7, #1
	; CHECK-NEXT: movs r1, #1			; CHECK-NEXT: mov.w r11, #0
	; CHECK-NEXT: mov r11, r2			; CHECK-NEXT: vmov.i32 q0, #0x0
	; CHECK-NEXT: bic r10, r9, #3			; CHECK-NEXT: bic r2, r12, #3
	; CHECK-NEXT: mov.w r8, #0			; CHECK-NEXT: subs r3, r2, #4
	; CHECK-NEXT: sub.w r0, r10, #4			; CHECK-NEXT: add.w r0, r7, r3, lsr #2
	; CHECK-NEXT: add.w r0, r1, r0, lsr #2			; CHECK-NEXT: ldr r7, [sp, #136]
	; CHECK-NEXT: ldr r1, [sp, #112]			; CHECK-NEXT: adr r3, .LCPI10_0
	; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
	; CHECK-NEXT: lsl.w r0, r9, #1
	; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
	; CHECK-NEXT: adr r0, .LCPI10_0
	; CHECK-NEXT: vdup.32 q4, r1
	; CHECK-NEXT: vldrw.u32 q5, [r0]
	; CHECK-NEXT: lsls r4, r1, #1
	; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
	; CHECK-NEXT: vshl.i32 q6, q4, #2
	; CHECK-NEXT: movs r1, #0
	; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill			; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
				; CHECK-NEXT: lsl.w r0, r12, #1
				; CHECK-NEXT: vdup.32 q1, r7
				; CHECK-NEXT: vldrw.u32 q2, [r3]
				; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
				; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
				; CHECK-NEXT: lsls r6, r7, #1
				; CHECK-NEXT: vshl.i32 q3, q1, #2
				; CHECK-NEXT: movs r3, #0
				; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
	; CHECK-NEXT: b .LBB10_5			; CHECK-NEXT: b .LBB10_5
	; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader			; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
	; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1			; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
	; CHECK-NEXT: add.w r0, r11, r12, lsl #1			; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
	; CHECK-NEXT: mov r1, r4			; CHECK-NEXT: add.w r3, r0, r5, lsl #1
	; CHECK-NEXT: bl __aeabi_memclr			; CHECK-NEXT: mov r5, r6
				; CHECK-NEXT: wlstp.8 lr, r5, .LBB10_4
				; CHECK-NEXT: b .LBB10_15
	; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us			; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
	; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1			; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
	; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
	; CHECK-NEXT: add r8, r9
	; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
	; CHECK-NEXT: add r1, r0
	; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
	; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
	; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload			; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
	; CHECK-NEXT: adds r1, #1			; CHECK-NEXT: add r11, r12
	; CHECK-NEXT: cmp r1, r0			; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
				; CHECK-NEXT: add r3, r0
				; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
				; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
				; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
				; CHECK-NEXT: adds r3, #1
				; CHECK-NEXT: cmp r3, r0
	; CHECK-NEXT: beq .LBB10_1			; CHECK-NEXT: beq .LBB10_1
	; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us			; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us
	; CHECK-NEXT: @ =>This Loop Header: Depth=1			; CHECK-NEXT: @ =>This Loop Header: Depth=1
	; CHECK-NEXT: @ Child Loop BB10_8 Depth 2			; CHECK-NEXT: @ Child Loop BB10_8 Depth 2
	; CHECK-NEXT: @ Child Loop BB10_11 Depth 3			; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
	; CHECK-NEXT: @ Child Loop BB10_14 Depth 3			; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
	; CHECK-NEXT: ldr r0, [sp, #112]			; CHECK-NEXT: @ Child Loop BB10_15 Depth 2
	; CHECK-NEXT: cmp.w r9, #0			; CHECK-NEXT: mul r5, r3, r7
	; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill			; CHECK-NEXT: cmp.w r12, #0
	; CHECK-NEXT: mul r12, r1, r0			; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
	; CHECK-NEXT: beq .LBB10_3			; CHECK-NEXT: beq .LBB10_3
	; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader			; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
	; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1			; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
	; CHECK-NEXT: movs r1, #0			; CHECK-NEXT: mov.w r8, #0
	; CHECK-NEXT: b .LBB10_8			; CHECK-NEXT: b .LBB10_8
	; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us			; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
	; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2			; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
	; CHECK-NEXT: ldr r0, [sp, #112]			; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload
	; CHECK-NEXT: add.w r3, r1, r12			; CHECK-NEXT: add.w r0, r8, r5
	; CHECK-NEXT: adds r1, #1			; CHECK-NEXT: add.w r8, r8, #1
	; CHECK-NEXT: cmp r1, r0			; CHECK-NEXT: cmp r8, r7
	; CHECK-NEXT: strh.w r2, [r11, r3, lsl #1]			; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1]
	; CHECK-NEXT: beq .LBB10_4			; CHECK-NEXT: beq .LBB10_4
	; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us			; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
	; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1			; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
	; CHECK-NEXT: @ => This Loop Header: Depth=2			; CHECK-NEXT: @ => This Loop Header: Depth=2
	; CHECK-NEXT: @ Child Loop BB10_11 Depth 3			; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
	; CHECK-NEXT: @ Child Loop BB10_14 Depth 3			; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
	; CHECK-NEXT: cmp.w r9, #3			; CHECK-NEXT: cmp.w r12, #3
	; CHECK-NEXT: bhi .LBB10_10			; CHECK-NEXT: bhi .LBB10_10
	; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2			; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
	; CHECK-NEXT: movs r7, #0			; CHECK-NEXT: movs r4, #0
	; CHECK-NEXT: movs r2, #0			; CHECK-NEXT: mov.w r10, #0
	; CHECK-NEXT: b .LBB10_13			; CHECK-NEXT: b .LBB10_13
	; CHECK-NEXT: .LBB10_10: @ %vector.ph			; CHECK-NEXT: .LBB10_10: @ %vector.ph
	; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2			; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
	; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload			; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
	; CHECK-NEXT: vmov q1, q4			; CHECK-NEXT: vmov q5, q1
	; CHECK-NEXT: vmov.i32 q0, #0x0			; CHECK-NEXT: vmov.i32 q4, #0x0
	; CHECK-NEXT: vmlas.u32 q1, q5, r1			; CHECK-NEXT: vmlas.u32 q5, q2, r8
	; CHECK-NEXT: dls lr, r0			; CHECK-NEXT: dls lr, r0
	; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload			; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
	; CHECK-NEXT: .LBB10_11: @ %vector.body			; CHECK-NEXT: .LBB10_11: @ %vector.body
	; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1			; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
	; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2			; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
	; CHECK-NEXT: @ => This Inner Loop Header: Depth=3			; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
	; CHECK-NEXT: vadd.i32 q2, q1, q6			; CHECK-NEXT: vadd.i32 q6, q5, q3
	; CHECK-NEXT: vldrh.s32 q3, [r6, q1, uxtw #1]			; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
	; CHECK-NEXT: vldrh.s32 q1, [r2], #8			; CHECK-NEXT: vldrh.s32 q5, [r3], #8
	; CHECK-NEXT: vmul.i32 q1, q3, q1			; CHECK-NEXT: vmul.i32 q5, q7, q5
	; CHECK-NEXT: vadd.i32 q0, q1, q0			; CHECK-NEXT: vadd.i32 q4, q5, q4
	; CHECK-NEXT: vmov q1, q2			; CHECK-NEXT: vmov q5, q6
	; CHECK-NEXT: le lr, .LBB10_11			; CHECK-NEXT: le lr, .LBB10_11
	; CHECK-NEXT: @ %bb.12: @ %middle.block			; CHECK-NEXT: @ %bb.12: @ %middle.block
	; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2			; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
	; CHECK-NEXT: vaddv.u32 r2, q0			; CHECK-NEXT: vaddv.u32 r10, q4
	; CHECK-NEXT: cmp r10, r9			; CHECK-NEXT: cmp r2, r12
	; CHECK-NEXT: mov r7, r10			; CHECK-NEXT: mov r4, r2
	; CHECK-NEXT: beq .LBB10_7			; CHECK-NEXT: beq .LBB10_7
	; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader			; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
	; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2			; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
	; CHECK-NEXT: ldr r0, [sp, #112]			; CHECK-NEXT: mla r3, r7, r4, r8
	; CHECK-NEXT: add.w r5, r8, r7			; CHECK-NEXT: add.w r0, r11, r4
	; CHECK-NEXT: sub.w lr, r9, r7			; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
	; CHECK-NEXT: mla r3, r0, r7, r1			; CHECK-NEXT: sub.w lr, r12, r4
	; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload			; CHECK-NEXT: add.w r9, r7, r0, lsl #1
	; CHECK-NEXT: add.w r5, r0, r5, lsl #1			; CHECK-NEXT: ldr r7, [sp, #136]
	; CHECK-NEXT: add.w r3, r6, r3, lsl #1			; CHECK-NEXT: add.w r3, r1, r3, lsl #1
	; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us			; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
	; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1			; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
	; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2			; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
	; CHECK-NEXT: @ => This Inner Loop Header: Depth=3			; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
	; CHECK-NEXT: ldrsh.w r0, [r3]			; CHECK-NEXT: ldrsh.w r4, [r3]
	; CHECK-NEXT: add r3, r4			; CHECK-NEXT: add r3, r6
	; CHECK-NEXT: ldrsh r7, [r5], #2			; CHECK-NEXT: ldrsh r0, [r9], #2
	; CHECK-NEXT: smlabb r2, r0, r7, r2			; CHECK-NEXT: smlabb r10, r4, r0, r10
	; CHECK-NEXT: le lr, .LBB10_14			; CHECK-NEXT: le lr, .LBB10_14
	; CHECK-NEXT: b .LBB10_7			; CHECK-NEXT: b .LBB10_7
				; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1
				; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
				; CHECK-NEXT: vstrb.8 q0, [r3], #16
				; CHECK-NEXT: letp lr, .LBB10_15
				; CHECK-NEXT: b .LBB10_4
	; CHECK-NEXT: .p2align 4			; CHECK-NEXT: .p2align 4
	; CHECK-NEXT: @ %bb.15:			; CHECK-NEXT: @ %bb.16:
	; CHECK-NEXT: .LCPI10_0:			; CHECK-NEXT: .LCPI10_0:
	; CHECK-NEXT: .long 0 @ 0x0			; CHECK-NEXT: .long 0 @ 0x0
	; CHECK-NEXT: .long 1 @ 0x1			; CHECK-NEXT: .long 1 @ 0x1
	; CHECK-NEXT: .long 2 @ 0x2			; CHECK-NEXT: .long 2 @ 0x2
	; CHECK-NEXT: .long 3 @ 0x3			; CHECK-NEXT: .long 3 @ 0x3
	entry:			entry:
	%cmp48 = icmp eq i32 %n, 0			%cmp48 = icmp eq i32 %n, 0
	br i1 %cmp48, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph			br i1 %cmp48, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
	▲ Show 20 Lines • Show All 360 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/mve-phireg.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - \| FileCheck %s			; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow -verify-machineinstrs %s -o - \| FileCheck %s

	; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.			; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.

	define arm_aapcs_vfpcc void @k() {			define arm_aapcs_vfpcc void @k() {
	; CHECK-LABEL: k:			; CHECK-LABEL: k:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}			; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
	; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}			; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
	▲ Show 20 Lines • Show All 131 Lines • ▼ Show 20 Lines
	@c = dso_local global i32 2, align 4			@c = dso_local global i32 2, align 4
	@d = dso_local global i32 2, align 4			@d = dso_local global i32 2, align 4

	define dso_local i32 @e() #0 {			define dso_local i32 @e() #0 {
	; CHECK-LABEL: e:			; CHECK-LABEL: e:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}			; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
	; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}			; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
	; CHECK-NEXT: .vsave {d8, d9, d10, d11}			; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
	; CHECK-NEXT: vpush {d8, d9, d10, d11}			; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
	; CHECK-NEXT: .pad #392			; CHECK-NEXT: .pad #416
	; CHECK-NEXT: sub sp, #392			; CHECK-NEXT: sub sp, #416
	; CHECK-NEXT: movw r9, :lower16:.L_MergedGlobals			; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals
	; CHECK-NEXT: vldr s0, .LCPI1_0			; CHECK-NEXT: vldr s12, .LCPI1_0
	; CHECK-NEXT: movt r9, :upper16:.L_MergedGlobals			; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals
	; CHECK-NEXT: vldr s3, .LCPI1_1			; CHECK-NEXT: vldr s15, .LCPI1_1
	; CHECK-NEXT: mov r7, r9			; CHECK-NEXT: mov r3, r7
	; CHECK-NEXT: mov r5, r9			; CHECK-NEXT: mov r4, r7
	; CHECK-NEXT: ldr r0, [r7, #4]!			; CHECK-NEXT: ldr r0, [r3, #4]!
	; CHECK-NEXT: movw r4, :lower16:e			; CHECK-NEXT: movw r2, :lower16:e
	; CHECK-NEXT: ldr r1, [r5, #8]!			; CHECK-NEXT: ldr r6, [r4, #8]!
	; CHECK-NEXT: movt r4, :upper16:e			; CHECK-NEXT: vmov r5, s15
	; CHECK-NEXT: vmov r6, s3			; CHECK-NEXT: vmov s13, r3
	; CHECK-NEXT: vdup.32 q4, r7			; CHECK-NEXT: vmov.i32 q0, #0x0
	; CHECK-NEXT: vmov s1, r7			; CHECK-NEXT: movt r2, :upper16:e
	; CHECK-NEXT: vmov q1[2], q1[0], r5, r5			; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
	; CHECK-NEXT: vmov s9, r4			; CHECK-NEXT: vmov q0[2], q0[0], r4, r4
	; CHECK-NEXT: vmov q1[3], q1[1], r6, r4			; CHECK-NEXT: vmov s21, r2
	; CHECK-NEXT: vmov.f32 s2, s1			; CHECK-NEXT: vmov.f32 s14, s13
	; CHECK-NEXT: vmov q3, q4			; CHECK-NEXT: vmov q0[3], q0[1], r5, r2
	; CHECK-NEXT: vmov.f32 s8, s0			; CHECK-NEXT: vmov.f32 s20, s12
	; CHECK-NEXT: vmov q5, q4			; CHECK-NEXT: vdup.32 q7, r3
	; CHECK-NEXT: vmov.f32 s10, s1			; CHECK-NEXT: vmov q6[2], q6[0], r3, r5
	; CHECK-NEXT: vstrw.32 q1, [sp, #76]			; CHECK-NEXT: vmov.f32 s22, s13
	; CHECK-NEXT: vmov q1[2], q1[0], r7, r6			; CHECK-NEXT: vstrw.32 q0, [sp, #100]
	; CHECK-NEXT: mov.w r8, #4			; CHECK-NEXT: vmov q0, q7
	; CHECK-NEXT: mov.w r10, #0			; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
	; CHECK-NEXT: vmov q1[3], q1[1], r7, r4			; CHECK-NEXT: vmov q4, q7
	; CHECK-NEXT: vmov.32 q3[0], r4			; CHECK-NEXT: vmov.32 q0[0], r2
	; CHECK-NEXT: vmov.32 q5[1], r4			; CHECK-NEXT: vmov.32 q7[1], r2
	; CHECK-NEXT: str r1, [r0]			; CHECK-NEXT: vmov.f32 s23, s15
	; CHECK-NEXT: vmov.f32 s11, s3
	; CHECK-NEXT: movs r1, #64			; CHECK-NEXT: movs r1, #64
	; CHECK-NEXT: strh.w r8, [sp, #390]			; CHECK-NEXT: str r0, [sp, #48]
	; CHECK-NEXT: strd r0, r10, [sp, #24]
	; CHECK-NEXT: vstrw.32 q0, [sp, #44]
	; CHECK-NEXT: str r0, [r0]
	; CHECK-NEXT: vstrw.32 q2, [r0]
	; CHECK-NEXT: vstrw.32 q5, [r0]			; CHECK-NEXT: vstrw.32 q5, [r0]
	; CHECK-NEXT: vstrw.32 q3, [r0]			; CHECK-NEXT: str r6, [r0]
	; CHECK-NEXT: vstrw.32 q1, [r0]			; CHECK-NEXT: vstrw.32 q7, [r0]
	; CHECK-NEXT: bl __aeabi_memclr4			; CHECK-NEXT: str r0, [r0]
	; CHECK-NEXT: vmov q0[2], q0[0], r5, r7
	; CHECK-NEXT: vmov q1[2], q1[0], r7, r7
	; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
	; CHECK-NEXT: vmov q1[3], q1[1], r5, r6
	; CHECK-NEXT: vmov.32 q4[0], r10
	; CHECK-NEXT: vstrw.32 q0, [r0]			; CHECK-NEXT: vstrw.32 q0, [r0]
	; CHECK-NEXT: str.w r10, [r9]			; CHECK-NEXT: vstrw.32 q6, [r0]
	; CHECK-NEXT: vstrw.32 q4, [r0]			; CHECK-NEXT: mov.w r8, #0
				; CHECK-NEXT: vmov q1[2], q1[0], r4, r3
				; CHECK-NEXT: vmov q2[2], q2[0], r3, r3
				; CHECK-NEXT: mov.w r12, #4
				malharJAuthorUnsubmitted Done Reply Inline Actions @dmgreen, this seems to be happening because of a 16-byte spill (line 169). And it's being generated after I updated the code to create the VDUP in EmitTargetCodeForMemset() instead of during the MIR-level transform. I'm not sure why the spill is happening (not really familiar with RA), but do you think it's worth investigating? It doesn't seem to happen in the simpler tests. malharJ: @dmgreen, this seems to be happening because of a 16-byte spill (line 169). And it's…
				dmgreenUnsubmitted Done Reply Inline Actions Yeah, this test might be difficult like that, it's designed to spill a lot. It's probably fine so long as we don't see it happening in other places. dmgreen: Yeah, this test might be difficult like that, it's designed to spill a lot. It's probably fine…
				; CHECK-NEXT: vmov q1[3], q1[1], r2, r4
				; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
				; CHECK-NEXT: vmov.32 q4[0], r8
				; CHECK-NEXT: @ implicit-def: $r2
				; CHECK-NEXT: str.w r8, [sp, #52]
				; CHECK-NEXT: strh.w r12, [sp, #414]
				; CHECK-NEXT: vstrw.32 q3, [sp, #68]
				; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2
				; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
				; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
				; CHECK-NEXT: vstrb.8 q0, [r2], #16
				; CHECK-NEXT: letp lr, .LBB1_1
				; CHECK-NEXT: .LBB1_2: @ %entry
	; CHECK-NEXT: vstrw.32 q1, [r0]			; CHECK-NEXT: vstrw.32 q1, [r0]
	; CHECK-NEXT: str.w r8, [sp, #308]			; CHECK-NEXT: str.w r8, [r7]
	; CHECK-NEXT: .LBB1_1: @ %for.cond			; CHECK-NEXT: vstrw.32 q4, [r0]
				; CHECK-NEXT: vstrw.32 q2, [r0]
				; CHECK-NEXT: str.w r12, [sp, #332]
				; CHECK-NEXT: .LBB1_3: @ %for.cond
	; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1			; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
	; CHECK-NEXT: b .LBB1_1			; CHECK-NEXT: b .LBB1_3
	; CHECK-NEXT: .p2align 2			; CHECK-NEXT: .p2align 2
	; CHECK-NEXT: @ %bb.2:			; CHECK-NEXT: @ %bb.4:
	; CHECK-NEXT: .LCPI1_0:			; CHECK-NEXT: .LCPI1_0:
	; CHECK-NEXT: .long 0x00000004 @ float 5.60519386E-45			; CHECK-NEXT: .long 0x00000004 @ float 5.60519386E-45
	; CHECK-NEXT: .LCPI1_1:			; CHECK-NEXT: .LCPI1_1:
	; CHECK-NEXT: .long 0x00000000 @ float 0			; CHECK-NEXT: .long 0x00000000 @ float 0
	entry:			entry:
	%f = alloca i16, align 2			%f = alloca i16, align 2
	%g = alloca [3 x [8 x [4 x i16*]]], align 4			%g = alloca [3 x [8 x [4 x i16*]]], align 4
	store i16 4, i16* %f, align 2			store i16 4, i16* %f, align 2
	▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

llvm/test/CodeGen/Thumb2/mve-tp-loop.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - \| FileCheck %s		; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - \| FileCheck %s

; Check that WLSTP loop is not generated for alignment < 4		; Check that WLSTP loop is not generated for alignment < 4
; void test1(char* dest, char* src, int n){		; void test1(char* dest, char* src, int n){
; memcpy(dest, src, n);		; memcpy(dest, src, n);
; }		; }

declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1		declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1		declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
		declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)

define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){		define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){
; CHECK-LABEL: test1:		; CHECK-LABEL: test1:
; CHECK: @ %bb.0: @ %entry		; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}		; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}		; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: bl __aeabi_memcpy		; CHECK-NEXT: bl __aeabi_memcpy
; CHECK-NEXT: pop {r7, pc}		; CHECK-NEXT: pop {r7, pc}
▲ Show 20 Lines • Show All 257 Lines • ▼ Show 20 Lines	for.body: ; preds = %for.body, %prehead
%inc = add nuw nsw i32 %i.09, 2		%inc = add nuw nsw i32 %i.09, 2
%exitcond.not = icmp eq i32 %inc, %n		%exitcond.not = icmp eq i32 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body		br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %entry		for.cond.cleanup: ; preds = %entry
ret void		ret void
}		}

		; Check that WLSTP loop is generated for simplest case of align = 1
		define void @test12(i8* %X, i8 zeroext %c, i32 %n) {
		; CHECK-LABEL: test12:
		; CHECK: @ %bb.0: @ %entry
		; CHECK-NEXT: .save {r7, lr}
		; CHECK-NEXT: push {r7, lr}
		; CHECK-NEXT: vdup.8 q0, r1
		; CHECK-NEXT: wlstp.8 lr, r2, .LBB11_2
		; CHECK-NEXT: .LBB11_1: @ =>This Inner Loop Header: Depth=1
		; CHECK-NEXT: vstrb.8 q0, [r0], #16
		; CHECK-NEXT: letp lr, .LBB11_1
		; CHECK-NEXT: .LBB11_2: @ %entry
		; CHECK-NEXT: pop {r7, pc}
		entry:
		call void @llvm.memset.p0i8.i32(i8* align 1 %X, i8 %c, i32 %n, i1 false)
		ret void
		}


		; Check that WLSTP loop is generated for alignment >= 4
		define void @test13(i32* %X, i8 zeroext %c, i32 %n) {
		; CHECK-LABEL: test13:
		; CHECK: @ %bb.0: @ %entry
		; CHECK-NEXT: .save {r7, lr}
		; CHECK-NEXT: push {r7, lr}
		; CHECK-NEXT: vdup.8 q0, r1
		; CHECK-NEXT: wlstp.8 lr, r2, .LBB12_2
		; CHECK-NEXT: .LBB12_1: @ =>This Inner Loop Header: Depth=1
		; CHECK-NEXT: vstrb.8 q0, [r0], #16
		; CHECK-NEXT: letp lr, .LBB12_1
		; CHECK-NEXT: .LBB12_2: @ %entry
		; CHECK-NEXT: pop {r7, pc}
		entry:
		%0 = bitcast i32* %X to i8*
		call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 %c, i32 %n, i1 false)
		ret void
		}


		; Checks that transform correctly handles input with some arithmetic on input arguments.
		; void test14(int* X, char c, int n)
		; {
		; memset(X+2, c, (n*2)+10);
		; }

		define void @test14(i32* %X, i8 zeroext %c, i32 %n) {
		; CHECK-LABEL: test14:
		; CHECK: @ %bb.0: @ %entry
		; CHECK-NEXT: .save {r7, lr}
		; CHECK-NEXT: push {r7, lr}
		; CHECK-NEXT: movs r3, #10
		; CHECK-NEXT: add.w r2, r3, r2, lsl #1
		; CHECK-NEXT: vdup.8 q0, r1
		; CHECK-NEXT: adds r0, #8
		; CHECK-NEXT: wlstp.8 lr, r2, .LBB13_2
		; CHECK-NEXT: .LBB13_1: @ =>This Inner Loop Header: Depth=1
		; CHECK-NEXT: vstrb.8 q0, [r0], #16
		; CHECK-NEXT: letp lr, .LBB13_1
		; CHECK-NEXT: .LBB13_2: @ %entry
		; CHECK-NEXT: pop {r7, pc}
		entry:
		%add.ptr = getelementptr inbounds i32, i32* %X, i32 2
		%0 = bitcast i32* %add.ptr to i8*
		%mul = shl nsw i32 %n, 1
		%add = add nsw i32 %mul, 10
		call void @llvm.memset.p0i8.i32(i8* nonnull align 4 %0, i8 %c, i32 %add, i1 false)
		ret void
		}




		; Checks that transform handles for-loops (that get implicitly converted to memset)
		; void test15(char* X, char c, int n){
		; for(int i = 0; i < n; ++i){
		; X[i] = c;
		; }
		; }

		define void @test15(i8* nocapture %X, i8 zeroext %c, i32 %n) {
		; CHECK-LABEL: test15:
		; CHECK: @ %bb.0: @ %entry
		; CHECK-NEXT: cmp r2, #1
		; CHECK-NEXT: it lt
		; CHECK-NEXT: bxlt lr
		; CHECK-NEXT: .LBB14_1: @ %for.body.preheader
		; CHECK-NEXT: .save {r7, lr}
		; CHECK-NEXT: push {r7, lr}
		; CHECK-NEXT: vdup.8 q0, r1
		; CHECK-NEXT: wlstp.8 lr, r2, .LBB14_3
		; CHECK-NEXT: .LBB14_2: @ =>This Inner Loop Header: Depth=1
		; CHECK-NEXT: vstrb.8 q0, [r0], #16
		; CHECK-NEXT: letp lr, .LBB14_2
		; CHECK-NEXT: .LBB14_3: @ %for.body.preheader
		; CHECK-NEXT: pop.w {r7, lr}
		; CHECK-NEXT: bx lr
		entry:
		%cmp4 = icmp sgt i32 %n, 0
		br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup

		for.body.preheader: ; preds = %entry
		call void @llvm.memset.p0i8.i32(i8* align 4 %X, i8 %c, i32 %n, i1 false)
		br label %for.cond.cleanup

		for.cond.cleanup: ; preds = %for.body.preheader, %entry
		ret void
		}

		; Checks that the transform handles the case with 0 as the src value. The same WLSTP loop is expected; only the splat differs (vmov.i32 q0, #0x0 instead of vdup.8 q0, r1).
		define void @test16(i32* %X, i8 zeroext %c, i32 %n) {
		; CHECK-LABEL: test16:
		; CHECK: @ %bb.0: @ %entry
		; CHECK-NEXT: .save {r7, lr}
		; CHECK-NEXT: push {r7, lr}
		; CHECK-NEXT: vmov.i32 q0, #0x0
		; CHECK-NEXT: wlstp.8 lr, r2, .LBB15_2
		; CHECK-NEXT: .LBB15_1: @ =>This Inner Loop Header: Depth=1
		; CHECK-NEXT: vstrb.8 q0, [r0], #16
		; CHECK-NEXT: letp lr, .LBB15_1
		; CHECK-NEXT: .LBB15_2: @ %entry
		; CHECK-NEXT: pop {r7, pc}
		entry:
		%0 = bitcast i32* %X to i8*
		call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 0, i32 %n, i1 false)
		ret void
		}

attributes #0 = { noinline optnone }		attributes #0 = { noinline optnone }
attributes #1 = { optsize }		attributes #1 = { optsize }

llvm/test/CodeGen/Thumb2/mve-tp-loop.mir

# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py		# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -simplify-mir --verify-machineinstrs -run-pass=finalize-isel %s -o - \| FileCheck %s		# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -simplify-mir --verify-machineinstrs -run-pass=finalize-isel %s -o - \| FileCheck %s
--- \|		--- \|
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"		target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "arm-arm-none-eabi"		target triple = "arm-arm-none-eabi"

; Function Attrs: argmemonly nofree nosync nounwind willreturn		; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)		declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
		; Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly
		declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)

define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {		define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
entry:		entry:
%0 = bitcast i32* %X to i8*		%0 = bitcast i32* %X to i8*
%1 = bitcast i32* %Y to i8*		%1 = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)		call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
ret void		ret void
}		}

define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {		define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
entry:		entry:
%cmp6 = icmp sgt i32 %n, 0		%cmp6 = icmp sgt i32 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup		br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry		for.body.preheader: ; preds = %entry
%X.bits = bitcast i32* %X to i8*		%X.bits = bitcast i32* %X to i8*
%Y.bits = bitcast i32* %Y to i8*		%Y.bits = bitcast i32* %Y to i8*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)		call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)
br label %for.cond.cleanup		br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body.preheader, %entry		for.cond.cleanup: ; preds = %for.body.preheader, %entry
ret void		ret void
}		}

		; test3: unconditional memset. %X is cast to i8* and %n bytes starting
		; there are filled with the byte argument %c (destination align 4), via a
		; tail call to the i32-length llvm.memset intrinsic.
		define void @test3(i32* nocapture %X, i8 zeroext %c, i32 %n) {
		entry:
		%0 = bitcast i32* %X to i8*
		tail call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 %c, i32 %n, i1 false)
		ret void
		}


		; test4: guarded memset. The call sits in for.body.preheader, which is
		; only entered when %n is greater than 0 (signed compare); otherwise
		; control goes straight to for.cond.cleanup. The destination here is an
		; i8* with align 1, unlike test3.
		define void @test4(i8* nocapture %X, i8 zeroext %c, i32 %n) {
		entry:
		%cmp4 = icmp sgt i32 %n, 0
		br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup

		for.body.preheader: ; preds = %entry
		call void @llvm.memset.p0i8.i32(i8* align 1 %X, i8 %c, i32 %n, i1 false)
		br label %for.cond.cleanup

		for.cond.cleanup: ; preds = %for.body.preheader, %entry
		ret void
		}

...		...
---		---
name: test1		name: test1
tracksRegLiveness: true		tracksRegLiveness: true
body: \|		body: \|
bb.0.entry:		bb.0.entry:
liveins: $r0, $r1, $r2		liveins: $r0, $r1, $r2

Show All 10 Lines	bb.0.entry:
; CHECK: .1:		; CHECK: .1:
; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.0, %8, %bb.1		; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.0, %8, %bb.1
; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %10, %bb.1		; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %10, %bb.1
; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %12, %bb.1		; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %12, %bb.1
; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %14, %bb.1		; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %14, %bb.1
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg		; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg		; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]		; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]		; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1		; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr		; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg		; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
; CHECK: .2.entry:		; CHECK: .2.entry:
; CHECK: tBX_RET 14 /* CC::al */, $noreg		; CHECK: tBX_RET 14 /* CC::al */, $noreg
%2:rgpr = COPY $r2		%2:rgpr = COPY $r2
%1:rgpr = COPY $r1		%1:rgpr = COPY $r1
%0:rgpr = COPY $r0		%0:rgpr = COPY $r0
Show All 24 Lines	body: \|
; CHECK: bb.3:		; CHECK: bb.3:
; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.1, %8, %bb.3		; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.1, %8, %bb.3
; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %10, %bb.3		; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %10, %bb.3
; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %12, %bb.3		; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %12, %bb.3
; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %14, %bb.3		; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %14, %bb.3
; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg		; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg		; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]		; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]		; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1		; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr		; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg		; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
; CHECK: bb.4.for.body.preheader:		; CHECK: bb.4.for.body.preheader:
; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg		; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
; CHECK: bb.2.for.cond.cleanup:		; CHECK: bb.2.for.cond.cleanup:
; CHECK: tBX_RET 14 /* CC::al */, $noreg		; CHECK: tBX_RET 14 /* CC::al */, $noreg
bb.0.entry:		bb.0.entry:
Show All 11 Lines	bb.1.for.body.preheader:
successors: %bb.2(0x80000000)		successors: %bb.2(0x80000000)

MVE_MEMCPYLOOPINST %0, %1, %2		MVE_MEMCPYLOOPINST %0, %1, %2

bb.2.for.cond.cleanup:		bb.2.for.cond.cleanup:
tBX_RET 14 /* CC::al */, $noreg		tBX_RET 14 /* CC::al */, $noreg

...		...
		# test3: single-block input whose only instruction besides the register
		# copies and the return is MVE_MEMSETLOOPINST %0, %1, %2. The CHECK lines
		# expect that pseudo to be replaced by a t2WhileLoopStart / t2LoopEnd loop
		# whose body is one MVE_VSTRBU8_post store predicated by MVE_VCTP8, with
		# the remaining-count register decremented by 16 each iteration.
		# NOTE(review): %1 is declared mqpr but copied directly from $r1;
		# presumably it stands in for the vector holding the fill value - confirm.
		# NOTE(review): the expected trip-count sequence is t2ADDri 15,
		# t2BICri 16, t2LSRri 4. A BIC with immediate 16 clears only bit 4 of
		# (n + 15) - confirm this is the intended rounding.
		---
		name: test3
		tracksRegLiveness: true
		body: \|
		bb.0.entry:
		liveins: $r0, $r1, $r2

		; CHECK-LABEL: name: test3
		; CHECK: liveins: $r0, $r1, $r2
		; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
		; CHECK: [[COPY1:%[0-9]+]]:mqpr = COPY $r1
		; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
		; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
		; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
		; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
		; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
		; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr
		; CHECK: .1:
		; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %8, %bb.1
		; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %10, %bb.1
		; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %12, %bb.1
		; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg
		; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg
		; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY1]], [[PHI]], 16, 1, [[MVE_VCTP8_]]
		; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI1]], 1
		; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
		; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
		; CHECK: .2.entry:
		; CHECK: tBX_RET 14 /* CC::al */, $noreg
		%2:rgpr = COPY $r2
		%1:mqpr = COPY $r1
		%0:rgpr = COPY $r0
		MVE_MEMSETLOOPINST %0, %1, %2
		tBX_RET 14 /* CC::al */, $noreg

		...
		# test4: three-block input in which MVE_MEMSETLOOPINST %0, %1, %2 sits in
		# bb.1.for.body.preheader, reached only when the t2CMPri / t2Bcc (lt) guard
		# in bb.0 does not branch to bb.2. The CHECK lines expect the pseudo to be
		# replaced by a t2WhileLoopStart in bb.1, a new loop block bb.3 containing
		# the MVE_VCTP8-predicated MVE_VSTRBU8_post store, and a new exit block
		# bb.4 that branches on to bb.2.
		# NOTE(review): as in test3, the expected trip-count sequence includes
		# t2BICri with immediate 16, which clears only bit 4 of (n + 15) -
		# confirm this is the intended rounding.
		---
		name: test4
		alignment: 2
		tracksRegLiveness: true
		body: \|
		; CHECK-LABEL: name: test4
		; CHECK: bb.0.entry:
		; CHECK: successors: %bb.1(0x50000000), %bb.2(0x30000000)
		; CHECK: liveins: $r0, $r1, $r2
		; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
		; CHECK: [[COPY1:%[0-9]+]]:mqpr = COPY $r1
		; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
		; CHECK: t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
		; CHECK: t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
		; CHECK: t2B %bb.1, 14 /* CC::al */, $noreg
		; CHECK: bb.1.for.body.preheader:
		; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
		; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
		; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
		; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
		; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.4, implicit-def $cpsr
		; CHECK: bb.3:
		; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %8, %bb.3
		; CHECK: [[PHI1:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %10, %bb.3
		; CHECK: [[PHI2:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %12, %bb.3
		; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI2]], 0, $noreg
		; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI2]], 16, 14 /* CC::al */, $noreg, $noreg
		; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post [[COPY1]], [[PHI]], 16, 1, [[MVE_VCTP8_]]
		; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI1]], 1
		; CHECK: t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
		; CHECK: t2B %bb.4, 14 /* CC::al */, $noreg
		; CHECK: bb.4.for.body.preheader:
		; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
		; CHECK: bb.2.for.cond.cleanup:
		; CHECK: tBX_RET 14 /* CC::al */, $noreg
		bb.0.entry:
		successors: %bb.1(0x50000000), %bb.2(0x30000000)
		liveins: $r0, $r1, $r2

		%2:rgpr = COPY $r2
		%1:mqpr = COPY $r1
		%0:rgpr = COPY $r0
		t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
		t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
		t2B %bb.1, 14 /* CC::al */, $noreg

		bb.1.for.body.preheader:
		MVE_MEMSETLOOPINST %0, %1, %2

		bb.2.for.cond.cleanup:
		tBX_RET 14 /* CC::al */, $noreg

		...

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Transforming memset to Tail predicated Loop
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 343657

llvm/lib/Target/ARM/ARMISelLowering.h

llvm/lib/Target/ARM/ARMISelLowering.cpp

llvm/lib/Target/ARM/ARMInstrMVE.td

llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp

llvm/lib/Target/ARM/ARMSubtarget.h

llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll

llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll

llvm/test/CodeGen/Thumb2/mve-phireg.ll

llvm/test/CodeGen/Thumb2/mve-tp-loop.ll

llvm/test/CodeGen/Thumb2/mve-tp-loop.mir

This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Transforming memset to Tail predicated Loop
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 343657

llvm/lib/Target/ARM/ARMISelLowering.h

llvm/lib/Target/ARM/ARMISelLowering.cpp

llvm/lib/Target/ARM/ARMInstrMVE.td

llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp

llvm/lib/Target/ARM/ARMSubtarget.h

llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll

llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll

llvm/test/CodeGen/Thumb2/mve-phireg.ll

llvm/test/CodeGen/Thumb2/mve-tp-loop.ll

llvm/test/CodeGen/Thumb2/mve-tp-loop.mir

[ARM] Transforming memset to Tail predicated Loop
ClosedPublic