Diff 126890

lib/Target/PowerPC/CMakeLists.txt

Show All 21 Lines	add_llvm_target(PowerPCCodeGen
PPCHazardRecognizers.cpp		PPCHazardRecognizers.cpp
PPCInstrInfo.cpp		PPCInstrInfo.cpp
PPCISelDAGToDAG.cpp		PPCISelDAGToDAG.cpp
PPCISelLowering.cpp		PPCISelLowering.cpp
PPCEarlyReturn.cpp		PPCEarlyReturn.cpp
PPCFastISel.cpp		PPCFastISel.cpp
PPCFrameLowering.cpp		PPCFrameLowering.cpp
PPCLoopPreIncPrep.cpp		PPCLoopPreIncPrep.cpp
		PPCLowerMemIntrinsics.cpp
PPCMCInstLower.cpp		PPCMCInstLower.cpp
PPCMachineFunctionInfo.cpp		PPCMachineFunctionInfo.cpp
PPCMIPeephole.cpp		PPCMIPeephole.cpp
PPCRegisterInfo.cpp		PPCRegisterInfo.cpp
PPCQPXLoadSplat.cpp		PPCQPXLoadSplat.cpp
PPCSubtarget.cpp		PPCSubtarget.cpp
PPCTargetMachine.cpp		PPCTargetMachine.cpp
PPCTargetObjectFile.cpp		PPCTargetObjectFile.cpp
Show All 15 Lines

lib/Target/PowerPC/PPC.h

	Show All 21 Lines
	#undef PPC			#undef PPC

	namespace llvm {			namespace llvm {
	class PPCTargetMachine;			class PPCTargetMachine;
	class PassRegistry;			class PassRegistry;
	class FunctionPass;			class FunctionPass;
	class MachineInstr;			class MachineInstr;
	class MachineOperand;			class MachineOperand;
				class ModulePass;
	class AsmPrinter;			class AsmPrinter;
	class MCInst;			class MCInst;
	class MCOperand;			class MCOperand;

	FunctionPass *createPPCCTRLoops();			FunctionPass *createPPCCTRLoops();
	#ifndef NDEBUG			#ifndef NDEBUG
	FunctionPass *createPPCCTRLoopsVerify();			FunctionPass *createPPCCTRLoopsVerify();
	#endif			#endif
	FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM);			FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM);
	FunctionPass *createPPCTOCRegDepsPass();			FunctionPass *createPPCTOCRegDepsPass();
	FunctionPass *createPPCEarlyReturnPass();			FunctionPass *createPPCEarlyReturnPass();
	FunctionPass *createPPCVSXCopyPass();			FunctionPass *createPPCVSXCopyPass();
	FunctionPass *createPPCVSXFMAMutatePass();			FunctionPass *createPPCVSXFMAMutatePass();
	FunctionPass *createPPCVSXSwapRemovalPass();			FunctionPass *createPPCVSXSwapRemovalPass();
	FunctionPass *createPPCReduceCRLogicalsPass();			FunctionPass *createPPCReduceCRLogicalsPass();
	FunctionPass *createPPCMIPeepholePass();			FunctionPass *createPPCMIPeepholePass();
	FunctionPass *createPPCBranchSelectionPass();			FunctionPass *createPPCBranchSelectionPass();
	FunctionPass *createPPCBranchCoalescingPass();			FunctionPass *createPPCBranchCoalescingPass();
	FunctionPass *createPPCQPXLoadSplatPass();			FunctionPass *createPPCQPXLoadSplatPass();
	FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL);			FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL);
	FunctionPass *createPPCTLSDynamicCallPass();			FunctionPass *createPPCTLSDynamicCallPass();
	FunctionPass *createPPCBoolRetToIntPass();			FunctionPass *createPPCBoolRetToIntPass();
	FunctionPass *createPPCExpandISELPass();			FunctionPass *createPPCExpandISELPass();

				ModulePass *createPPCLowerMemIntrinsicsPass();

	void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,			void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
	AsmPrinter &AP, bool isDarwin);			AsmPrinter &AP, bool isDarwin);
	bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,			bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
	MCOperand &OutMO, AsmPrinter &AP,			MCOperand &OutMO, AsmPrinter &AP,
	bool isDarwin);			bool isDarwin);

	void initializePPCVSXFMAMutatePass(PassRegistry&);			void initializePPCVSXFMAMutatePass(PassRegistry&);
	void initializePPCBoolRetToIntPass(PassRegistry&);			void initializePPCBoolRetToIntPass(PassRegistry&);
	void initializePPCExpandISELPass(PassRegistry &);			void initializePPCExpandISELPass(PassRegistry &);
	void initializePPCTLSDynamicCallPass(PassRegistry &);			void initializePPCTLSDynamicCallPass(PassRegistry &);
				void initializePPCLowerMemIntrinsicsPass(llvm::PassRegistry&);
	extern char &PPCVSXFMAMutateID;			extern char &PPCVSXFMAMutateID;

	namespace PPCII {			namespace PPCII {

	/// Target Operand Flag enum.			/// Target Operand Flag enum.
	enum TOF {			enum TOF {
	//===------------------------------------------------------------------===//			//===------------------------------------------------------------------===//
	// PPC Specific MachineOperand flags.			// PPC Specific MachineOperand flags.
	▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp

This file was added.

				//===-------- PPCLowerMemIntrinsics.cpp - Expand memory intrinsics -------===//
				//
				// The LLVM Compiler Infrastructure
				//
				// This file is distributed under the University of Illinois Open Source
				// License. See LICENSE.TXT for details.
				//
				//===----------------------------------------------------------------------===//
				///
				/// An IR to IR pass that expands llvm.memcpy intrinsics into the equivalent
				/// load-store loops.
				///
				//===----------------------------------------------------------------------===//

				#include "PPC.h"
				#include "llvm/ADT/Statistic.h"
				#include "llvm/ADT/StringSwitch.h"
				#include "llvm/Analysis/TargetTransformInfo.h"
				#include "llvm/IR/Constants.h"
				#include "llvm/IR/IRBuilder.h"
				#include "llvm/IR/Instructions.h"
				#include "llvm/IR/IntrinsicInst.h"
				#include "llvm/IR/Module.h"
				#include "llvm/Pass.h"
				#include "llvm/Support/CommandLine.h"
				#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

				#define DEBUG_TYPE "ppc-memcpy-loop-lowering"

				nemanjaiUnsubmitted Not Done Reply Inline Actions I really think the debug type and the options should reflect what this pass does. Naming something `lower-mem-intrinsics` suggests that without it we don't lower these, which isn't entirely the case. Also, no real information is gained from statements such as "extra memcpy expansions". On the other hand, saying what this thing does is useful - such as "Expand memcpy calls into loops under specified conditions". And respectively, an option such as `ppc-enable-memcpy-loops`. nemanjai: I really think the debug type and the options should reflect what this pass does. Naming…
				sfertileAuthorUnsubmitted Not Done Reply Inline Actions This makes sense to me. Initially I thought we would end up doing similar expansions for some of the other memory-intrinsics. I've only updated the debug-type and option for now but I think I will change the file-name and pass-name as well since they really aren't describing what we are currently doing. If/when we add other transformations we can update the name to reflect that. sfertile: This makes sense to me. Initially I thought we would end up doing similar expansions for some…
				// This pass will loop over all MemCpyInstrs and expand some of them into loops.
				// For known compile time sizes, calls where the size belongs to
				// [MemcpyLoopFloor, MemcpyLoopCeil] will be expanded. For unknown sizes we are
				// expanding all call sites.

				hfinkelUnsubmitted Done Reply Inline Actions Please remove "The pass is off by default and can be...". These kinds of comments should, by definition, be true only temporarily, and the risk of becoming stale is high. Plus, nearly all passes have these kinds of cl::opts, and we don't have these kinds of comments in general. Feel free to add a comment below, near the relevant cl::opt, stating that this option enables or disables the entire transformation. hfinkel: Please remove "The pass is off by default and can be...". These kinds of comments should, by…
				STATISTIC(MemCpyLoopExpansions, "Number of memcpy calls expanded into a loop.");

				nemanjaiUnsubmitted Done Reply Inline Actions I suppose the temporary comments such as the last two sentences are fine as long as you remember to remove them in the follow-up patch. nemanjai: I suppose the temporary comments such as the last two sentences are fine as long as you…
				sfertileAuthorUnsubmitted Not Done Reply Inline Actions Removed the temporary comment anyway. sfertile: Removed the temporary comment anyway.
				using namespace llvm;

				// Master switch for the whole transformation: runOnModule returns without
				// touching the module unless this option is set.
				static cl::opt<bool> EnableMemcpyExpansionPass(
				    "ppc-enable-memcpy-loops", cl::Hidden, cl::init(false),
				    cl::desc("Enable the PPC pass that lowers memcpy calls into loops."));

				// Inclusive bounds on the constant length of a memcpy call; only calls whose
				// known length falls inside [floor, ceil] are expanded.
				static cl::opt<unsigned> MemcpyLoopFloor(
				    "ppc-memcpy-loop-floor",
				    cl::desc(
				        "The lower size bound of memcpy calls to get expanded into a loop"),
				    cl::init(129), cl::Hidden);

				static cl::opt<unsigned> MemcpyLoopCeil(
				    "ppc-memcpy-loop-ceil",
				    cl::desc("The upper size bound of memcpy calls to get expanded in a loop"),
				    cl::init(256), cl::Hidden);

				namespace {
				/// Module pass that replaces selected llvm.memcpy calls with explicit
				/// load/store loops.
				class PPCLowerMemIntrinsics : public ModulePass {
				public:
				  static char ID;

				  PPCLowerMemIntrinsics() : ModulePass(ID) {}

				  StringRef getPassName() const override {
				    return "PPC Lower memcpy into loops";
				  }

				  /// The expansion needs the target transform info of each calling function.
				  void getAnalysisUsage(AnalysisUsage &AU) const override {
				    AU.addRequired<TargetTransformInfoWrapperPass>();
				  }

				  bool runOnModule(Module &M) override;

				  /// Loops over all uses of llvm.memcpy and expands the call if warranted.
				  /// \p MemcpyDecl is the function declaration of llvm.memcpy.
				  /// \returns true if at least one call was expanded.
				  bool expandMemcopies(Function &MemcpyDecl);
				};
				} // end anonymous namespace


				// Checks whether the cpu arch is one where we want to expand
				// memcpy calls.
				static bool CPUCheck(const std::string &CpuStr) {
				nemanjaiUnsubmitted Done Reply Inline Actions Seems like in most implementation files, the convention is to put these at the bottom of the file. Having them in a fixed location in every file makes them easy to find. nemanjai: Seems like in most implementation files, the convention is to put these at the bottom of the…
				return StringSwitch<bool>(CpuStr)
				.Case("pwr8", true)
				.Case("pwr9", true)
				.Case("ppc64le", true) // The default cpu for little-endian.
				.Default(false);
				}
				nemanjaiUnsubmitted Not Done Reply Inline Actions Umm, this is kind of meaningless. Technically, PowerPC CPU's have been capable of running in both little-endian and big-endian mode for about 4-5 generations. If the pass does not check endianness, there's no real need to mention it here. nemanjai: Umm, this is kind of meaningless. Technically, PowerPC CPU's have been capable of running in…
				sfertileAuthorUnsubmitted Not Done Reply Inline Actions I added 'ppc64le' since that is the default target_cpu on little-endian. I thought that little-endian was only officially supported starting in Power8, so having a LE cpu indicated Power8 or above for the architecture. If that's not the case I can rethink this switch. sfertile: I added 'ppc64le' since that is the default target_cpu on little-endian. I thought that little…
				inouehrsUnsubmitted Not Done Reply Inline Actions Why not test `DataLayout.isLittleEndian()`? I think PPC chips have been bi-endian for a long time (e.g. ancient Windows NT for PPC ran in little-endian mode). But the LE Linux supports POWER8 and later, but BE Linux also runs on P8. inouehrs: Why not test `DataLayout.isLittleEndian()`? I think PPC chips have been bi-endian for a long…

				// Determines if we want to expand a specific memcpy call.
				//
				// Returns false for calls inside functions that are optimized for size or
				// marked optnone, and for functions whose "target-cpu" attribute is missing
				// or rejected by CPUCheck. Otherwise a constant-length call is expanded only
				// when its length lies in [MemcpyLoopFloor, MemcpyLoopCeil], and a
				// variable-length call is always expanded.
				static bool shouldExpandMemCpy(MemCpyInst *MC) {
				  // If compiling for -O0, -Oz or -Os we don't want to expand.
				  Function *ParentFunc = MC->getParent()->getParent();
				  if (ParentFunc->optForSize() ||
				      ParentFunc->hasFnAttribute(Attribute::OptimizeNone))
				    return false;

				  // See if the cpu arch is one we want to expand for. If there is no
				  // target-cpu attribute assume we don't want to expand.
				  Attribute CPUAttr = ParentFunc->getFnAttribute("target-cpu");
				  if (CPUAttr.hasAttribute(Attribute::None) ||
				      !CPUCheck(CPUAttr.getValueAsString()))
				    return false;

				hfinkelUnsubmitted Not Done Reply Inline Actions You should allow this check to be overridden when the user explicitly enables the transformation. You can do this by checking `EnableMemcpyExpansionPass.getNumOccurrences() > 0 && EnableMemcpyExpansionPass`. hfinkel: You should allow this check to be overridden when the user explicitly enables the…
				sfertileAuthorUnsubmitted Not Done Reply Inline Actions This pass is off by default right now, so we have to specify this at least once to turn it on. I could add this check in the follow up patch that enables the pass by default though. sfertile: This pass is off by default right now, so we have to specify this at least once to turn it on.
				  // Expand known sizes within the range [MemcpyLoopFloor, MemcpyLoopCeil].
				  if (auto *CISize = dyn_cast<ConstantInt>(MC->getLength())) {
				    const uint64_t Size = CISize->getZExtValue();
				    return Size >= MemcpyLoopFloor && Size <= MemcpyLoopCeil;
				  }

				  // Otherwise expand unknown sizes.
				  return true;
				}

				// Expands every call of the llvm.memcpy declaration \p F that
				// shouldExpandMemCpy accepts into a load/store loop and erases the call.
				// Returns true if at least one call was expanded.
				bool PPCLowerMemIntrinsics::expandMemcopies(Function &F) {
				  assert(Intrinsic::memcpy == F.getIntrinsicID() &&
				         "expandMemcopies called on wrong function declaration.");
				  bool AnyExpanded = false;
				  // loop over all memcpy calls
				hfinkelUnsubmitted Not Done Reply Inline Actions I don't understand this comment. It looks like this function is just llvm::expandMemCpyAsLoop whenever TTI.useWideIRMemcpyLoopLowering() returns false. Why not just use that function? hfinkel: I don't understand this comment. It looks like this function is just llvm::expandMemCpyAsLoop…
				sfertileAuthorUnsubmitted Not Done Reply Inline Actions This is so old it took me a while to figure out what was going on here but I think I got it now. This is here essentially because: It originally took pgo data into account but that is split out into a subsequent patch making this the same as ' llvm::expandMemCpyAsLoop when TTI.useWideIRMemcpyLoopLowering() returns false as you pointed out. I didn't use ' llvm::expandMemCpyAsLoop' directly because useWideIRMemcpyLoopLowering is actually off by default for every target, even PPC(!). The reason 'useWideIRMemcpyLoopLowering ' got added in the first place was because I was hesitant to change the targets I didn't have a way of running functional testing for (amd and nvptx backends), and it was only there to give people an easy way to test it out before flipping the switch to the new implementation. It was supposed too get removed in a subsequent cleanup patch that I never implemented. Rather then changing this to use 'llvm::expandMemCpyAsLoop' I would like to leave it since the follow up patch (D32872) modifies it so it no longer matches, and I will post the cleanup patch that removes the extra TTI hook and the old byte-copy only implementation of memcpy lowering separately since it can go in independent of this. sfertile: This is so old it took me a while to figure out what was going on here but I think I got it now.
				hfinkelUnsubmitted Not Done Reply Inline Actions Okay. hfinkel: Okay.
				hfinkelUnsubmitted Done Reply Inline Actions "Will go away when the old memcpy expansion implementation does." - Please refer here to some specific function (i.e. a particular piece of code), so that we know if this comment becomes stale. hfinkel: "Will go away when the old memcpy expansion implementation does." - Please refer here to some…
				  // Erasing a call removes it from F's use list, so the iterator is advanced
				  // past the current user before that user can be erased.
				  for (auto UI = F.user_begin(), UE = F.user_end(); UI != UE;) {
				    // Every user of the memcpy declaration is expected to be a memcpy call;
				    // cast<> checks that in asserts builds instead of producing a null
				    // pointer that would be dereferenced below.
				    auto *MC = cast<MemCpyInst>(*UI);
				    ++UI;
				    if (!shouldExpandMemCpy(MC))
				      continue;

				    // Query the TTI of the function containing the call, not of the
				    // declaration being walked.
				    Function *ParentFunc = MC->getParent()->getParent();
				    const TargetTransformInfo &TTI =
				        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
				    expandMemCpyAsLoop(MC, TTI);
				    MC->eraseFromParent();
				    AnyExpanded = true;
				    ++MemCpyLoopExpansions;
				  }
				  return AnyExpanded;
				}

				// Pass entry point. When the pass is enabled and the module is not skipped,
				// expands the calls of every llvm.memcpy declaration in \p M. Returns true
				// if any call was expanded.
				bool PPCLowerMemIntrinsics::runOnModule(Module &M) {
				  if (!EnableMemcpyExpansionPass || skipModule(M))
				    return false;

				  bool Modified = false;
				  for (Function &F : M) {
				nemanjaiUnsubmitted Done Reply Inline Actions Assert message please. nemanjai: Assert message please.
				    // Looking for the declaration of llvm.memcpy so we can skip
				    // any definition.
				    if (!F.isDeclaration())
				      continue;

				    switch (F.getIntrinsicID()) {
				    default:
				nemanjaiUnsubmitted Done Reply Inline Actions Wouldn't we want the TTI from the caller? I know that in practice there's no difference, but I can certainly envision possibilities where that isn't really the case. nemanjai: Wouldn't we want the TTI from the caller? I know that in practice there's no difference, but I…
				sfertileAuthorUnsubmitted Not Done Reply Inline Actions You are right. The reason I moved the check to inside the loop was so that I could get the TTI from the caller, then I forgot to actually do that. sfertile: You are right. The reason I moved the check to inside the loop was so that I could get the TTI…
				      break;
				jtonyUnsubmitted Done Reply Inline Actions Minor nit: Why not just use MemCpyLoopExpansions++? jtony: Minor nit: Why not just use MemCpyLoopExpansions++?
				    case Intrinsic::memcpy:
				      // llvm.memcpy is overloaded, so a module may contain several
				      // declarations. Accumulate the result so that a later declaration with
				      // nothing to expand does not hide an earlier change.
				      Modified |= expandMemcopies(F);
				      break;
				    }
				  }

				  return Modified;
				}

				/// Creates a new instance of the memcpy loop-expansion pass; the caller
				/// receives the raw pointer returned by new.
				ModulePass *llvm::createPPCLowerMemIntrinsicsPass() {
				  ModulePass *Pass = new PPCLowerMemIntrinsics();
				  return Pass;
				}

				// Storage for the pass identifier handed to the ModulePass constructor.
				char PPCLowerMemIntrinsics::ID = 0;
				nemanjaiUnsubmitted Done Reply Inline Actions I don't think there's a need to split these two early exits since there's nothing between them. nemanjai: I don't think there's a need to split these two early exits since there's nothing between them.
				// Registers the pass under the argument name "PPCLowerMemIntrinsics", the
				// flag the accompanying test passes to opt. The two trailing false values
				// are presumably the CFG-only and is-analysis flags; confirm against the
				// INITIALIZE_PASS definition.
				INITIALIZE_PASS(PPCLowerMemIntrinsics, "PPCLowerMemIntrinsics",
				"Lower mem intrinsics into loops", false, false)

lib/Target/PowerPC/PPCTargetMachine.cpp

Show First 20 Lines • Show All 96 Lines • ▼ Show 20 Lines	extern "C" void LLVMInitializePowerPCTarget() {
RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());		RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target());		RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target());
RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());		RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());

PassRegistry &PR = *PassRegistry::getPassRegistry();		PassRegistry &PR = *PassRegistry::getPassRegistry();
initializePPCBoolRetToIntPass(PR);		initializePPCBoolRetToIntPass(PR);
initializePPCExpandISELPass(PR);		initializePPCExpandISELPass(PR);
initializePPCTLSDynamicCallPass(PR);		initializePPCTLSDynamicCallPass(PR);
		initializePPCLowerMemIntrinsicsPass(PR);
}		}

/// Return the datalayout string of a subtarget.		/// Return the datalayout string of a subtarget.
static std::string getDataLayoutString(const Triple &T) {		static std::string getDataLayoutString(const Triple &T) {
bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;		bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le;
std::string Ret;		std::string Ret;

// Most PPC* platforms are big endian, PPC64LE is little endian.		// Most PPC* platforms are big endian, PPC64LE is little endian.
▲ Show 20 Lines • Show All 219 Lines • ▼ Show 20 Lines	void PPCPassConfig::addIRPasses() {
// intrinsics.		// intrinsics.
bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ &&		bool UsePrefetching = TM->getTargetTriple().getVendor() == Triple::BGQ &&
getOptLevel() != CodeGenOpt::None;		getOptLevel() != CodeGenOpt::None;
if (EnablePrefetch.getNumOccurrences() > 0)		if (EnablePrefetch.getNumOccurrences() > 0)
UsePrefetching = EnablePrefetch;		UsePrefetching = EnablePrefetch;
if (UsePrefetching)		if (UsePrefetching)
addPass(createLoopDataPrefetchPass());		addPass(createLoopDataPrefetchPass());


		if (TM->getOptLevel() != CodeGenOpt::None)
		addPass(createPPCLowerMemIntrinsicsPass());

if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) {		if (TM->getOptLevel() >= CodeGenOpt::Default && EnableGEPOpt) {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices		// Call SeparateConstOffsetFromGEP pass to extract constants within indices
// and lower a GEP with multiple indices to either arithmetic operations or		// and lower a GEP with multiple indices to either arithmetic operations or
// multiple GEPs with single index.		// multiple GEPs with single index.
addPass(createSeparateConstOffsetFromGEPPass(TM, true));		addPass(createSeparateConstOffsetFromGEPPass(TM, true));
// Call EarlyCSE pass to find and remove subexpressions in the lowered		// Call EarlyCSE pass to find and remove subexpressions in the lowered
// result.		// result.
addPass(createEarlyCSEPass());		addPass(createEarlyCSEPass());
// Do loop invariant code motion in case part of the lowered result is		// Do loop invariant code motion in case part of the lowered result is
// invariant.		// invariant.
addPass(createLICMPass());		addPass(createLICMPass());
}		}

TargetPassConfig::addIRPasses();		TargetPassConfig::addIRPasses();
}		}
		nemanjaiUnsubmitted Not Done Reply Inline Actions Would it be possible (or appropriate) to run the loop passes after this since we may have introduced loops? Seems like unrolling and vectorization may be beneficial (at least in some circumstances). nemanjai: Would it be possible (or appropriate) to run the loop passes after this since we may have…
		sfertileAuthorUnsubmitted Not Done Reply Inline Actions Yes, especially for the known size loop expansion. I initially tested this with the loop expansion being hand unrolled between 2-4 times and it had much better performance in my micro-benches. sfertile: Yes, especially for the known size loop expansion. I initially tested this with the loop…
		hfinkelUnsubmitted Done Reply Inline Actions I think that it would make sense to put this before the GEPOpt passes above (not after them, as it is now). Did you try that? If you do, then you'll get index optimizations, LICM, etc. after the expansion, which I can imagine might be useful. hfinkel: I think that it would make sense to put this before the GEPOpt passes above (not after them, as…

bool PPCPassConfig::addPreISel() {		bool PPCPassConfig::addPreISel() {
if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None)		if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None)
addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));		addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));

if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)		if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
addPass(createPPCCTRLoops());		addPass(createPPCCTRLoops());

▲ Show 20 Lines • Show All 93 Lines • Show Last 20 Lines

lib/Target/PowerPC/PPCTargetTransformInfo.h

Show First 20 Lines • Show All 85 Lines • ▼ Show 20 Lines	public:
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);		int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,		int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);		unsigned AddressSpace, const Instruction *I = nullptr);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,		int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,		unsigned Factor,
ArrayRef<unsigned> Indices,		ArrayRef<unsigned> Indices,
unsigned Alignment,		unsigned Alignment,
unsigned AddressSpace);		unsigned AddressSpace);
		/// Returns the operand type to use for the loads and stores of an expanded
		/// memcpy loop. The result and \p Length are pointers; the by-value spelling
		/// cannot match a body that returns Type::getInt64Ty(Context).
		Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
		                                unsigned SrcAlign, unsigned DestAlign) const;
		/// Appends to \p OpsOut the sequence of operand types used to copy
		/// \p RemainingBytes bytes.
		void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
		LLVMContext &Context,
		unsigned RemainingBytes,
		unsigned SrcAlign,
		unsigned DestAlign) const;
		inouehrsUnsubmitted Not Done Reply Inline Actions Currently, there is no use of these two methods. Do you add them for the subsequent patches? inouehrs: Currently, there is no use of these two methods. Do you add them for the subsequent patches?
/// @}		/// @}
};		};

} // end namespace llvm		} // end namespace llvm

#endif		#endif

lib/Target/PowerPC/PPCTargetTransformInfo.cpp

Show First 20 Lines • Show All 474 Lines • ▼ Show 20 Lines	int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
// instruction). For each result vector, we need one shuffle per incoming		// instruction). For each result vector, we need one shuffle per incoming
// vector (except that the first shuffle can take two incoming vectors		// vector (except that the first shuffle can take two incoming vectors
// because it does not need to take itself).		// because it does not need to take itself).
Cost += Factor*(LT.first-1);		Cost += Factor*(LT.first-1);

return Cost;		return Cost;
}		}

		/// Returns the operand type for the loads and stores of an expanded memcpy
		/// loop: always i64, regardless of \p Length and the two alignments.
		Type *PPCTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context,
		                                            Value *Length, unsigned SrcAlign,
		                                            unsigned DestAlign) const {
		  return Type::getInt64Ty(Context);
		}

		/// Breaks a copy of \p RemainingBytes bytes into individual integer operands,
		/// widest type first, and appends their types to \p OpsOut.
		void PPCTTIImpl::getMemcpyLoopResidualLoweringType(
		    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
		    unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const {
		  // Candidate operand types, ordered from widest to narrowest.
		  IntegerType *Candidates[] = {
		      Type::getInt64Ty(Context), Type::getInt32Ty(Context),
		      Type::getInt16Ty(Context), Type::getInt8Ty(Context)};

		  for (IntegerType *Ty : Candidates) {
		    const unsigned Bytes = Ty->getBitWidth() / 8;
		    // Emit as many operands of this width as fit. More than one is needed
		nemanjaiUnsubmitted Done Reply Inline Actions Nit: full sentences for comments. And s/incase/in case. nemanjai: Nit: full sentences for comments. And s/incase/in case.
		    // when the remaining bytes are at least twice the operand size.
		    const unsigned Count = RemainingBytes / Bytes;
		    OpsOut.append(Count, Ty);
		    RemainingBytes -= Count * Bytes;
		  }
		}

test/CodeGen/PowerPC/memcpy-loop-expansion.ll

This file was added.

				; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \
				; RUN: -mtriple=powerpc64le-unknown-linux-gnu -ppc-memcpy-loop-floor=0 \
				; RUN: -mcpu=pwr8 %s | FileCheck -check-prefix=OPT %s
				; RUN: opt -S -PPCLowerMemIntrinsics -ppc-enable-memcpy-loops=true \
				; RUN: -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 %s | \
				; RUN: FileCheck %s --check-prefix PWR7
				; RUN: llc < %s -ppc-enable-memcpy-loops=true \
				; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -O0 | \
				; RUN: FileCheck %s --check-prefix OPTNONE

				declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #0

				; Check that memcpy calls with a known zero length are removed.
				define i8* @memcpy_zero_size(i8* %dst, i8* %src) {
				nemanjaiUnsubmitted Not Done Reply Inline Actions In general, I think all the functions should include checks for `PWR7`, `OPTSMALL` and `OPTNONE` (providing that the `-Os/-Oz` can be specified on the command line for the tool you're invoking). nemanjai: In general, I think all the functions should include checks for `PWR7`, `OPTSMALL` and…
				sfertileAuthorUnsubmitted Not Done Reply Inline Actions I've changed the optsmall checks to be run as part of the OPT test since they didn't really have to be run on their own. The reason the no-opt test is run with llc is because opt didn't respect -O0 when invoked like this. I'll expand the PWR7 checks to the other functions as well. sfertile: I've changed the optsmall checks to be run as part of the OPT test since they didn't really…
				entry:
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 0, i32 1, i1 false)
				ret i8* %dst

				; OPT-LABEL: @memcpy_zero_size
				; OPT-NEXT: entry:
				; OPT-NEXT: ret i8* %dst
				}

				; Check that a memcpy call with a known size smaller than the loop operand
				; type is handled properly.
				define i8* @memcpy_small_size(i8* %dst, i8* %src) {
				entry:
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 7, i32 1, i1 false)
				ret i8* %dst

				; OPT-LABEL: @memcpy_small_size
				; OPT-NEXT: entry:
				; OPT-NEXT: [[SrcAsi32:%[0-9]+]] = bitcast i8* %src to i32*
				; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 0
				; OPT-NEXT: [[Load:%[0-9]+]] = load i32, i32* [[SrcGep]]
				; OPT-NEXT: [[DstAsi32:%[0-9]+]] = bitcast i8* %dst to i32*
				; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 0
				; OPT-NEXT: store i32 [[Load]], i32* [[DstGep]]
				; OPT-NEXT: [[SrcAsi16:%[0-9]+]] = bitcast i8* %src to i16*
				; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds i16, i16* [[SrcAsi16]], i64 2
				; OPT-NEXT: [[Load2:%[0-9]+]] = load i16, i16* [[SrcGep2]]
				; OPT-NEXT: [[DstAsi16:%[0-9]+]] = bitcast i8* %dst to i16*
				; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds i16, i16* [[DstAsi16]], i64 2
				; OPT-NEXT: store i16 [[Load2]], i16* [[DstGep2]]
				; OPT-NEXT: [[SrcGep3:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 6
				; OPT-NEXT: [[Load3:%[0-9]+]] = load i8, i8* [[SrcGep3]]
				; OPT-NEXT: [[DstGep3:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 6
				; OPT-NEXT: store i8 [[Load3]], i8* [[DstGep3]]
				; OPT-NEXT: ret i8* %dst
				}

				; Check the expansion of a memcpy call with compile-time size.
				define i8* @memcpy_known_size(i8* %dst, i8* %src) {
				entry:
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 100, i32 1, i1 false)
				ret i8* %dst
				; OPT-LABEL: @memcpy_known_size
				; OPT: entry:
				; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64*
				; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64*
				; OPT-NEXT: br label %load-store-loop

				; OPT: load-store-loop:
				; OPT-NEXT: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
				; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index
				; OPT-NEXT: [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]]
				; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index
				; OPT-NEXT: store i64 [[Load]], i64* [[DstGep]]
				; OPT-NEXT: [[IndexInc]] = add i64 %loop-index, 1
				; OPT-NEXT: [[CMP:%[0-9]+]] = icmp ult i64 [[IndexInc]], 12
				; OPT-NEXT: br i1 [[CMP]], label %load-store-loop, label %memcpy-split

				; OPT: memcpy-split:
				; OPT-NEXT: [[SrcAsi32:%[0-9]+]] = bitcast i64* [[SrcCast]] to i32*
				; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[SrcAsi32]], i64 24
				; OPT-NEXT: [[Load2:%[0-9]+]] = load i32, i32* [[SrcGep2]]
				; OPT-NEXT: [[DstAsi32:%[0-9]+]] = bitcast i64* [[DstCast]] to i32*
				; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds i32, i32* [[DstAsi32]], i64 24
				; OPT-NEXT: store i32 [[Load2]], i32* [[DstGep2]]
				; OPT-NEXT: ret i8* %dst
				}


				; Check the expansion of a memcpy whose size argument is not a compile time
				; constant.
				define i8* @memcpy_unkown_size(i8* %dst, i8* %src, i64 %len) {
				entry:
				; The copy length is the function argument %len, so it is only known at run time.
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
				ret i8* %dst

				; Expected entry block after expansion: both pointers are cast to i64*, and
				; the length is split into a count of whole 8-byte elements (%len / 8), the
				; number of leftover bytes (%len % 8), and the number of bytes the main loop
				; covers (%len minus the leftover). The main loop is skipped when the element
				; count is zero.
				; OPT-LABEL: @memcpy_unkown_size
				; OPT: entry:
				; OPT-NEXT: [[SrcCast:%[0-9]+]] = bitcast i8* %src to i64*
				; OPT-NEXT: [[DstCast:%[0-9]+]] = bitcast i8* %dst to i64*
				; OPT-NEXT: [[LoopCount:%[0-9]+]] = udiv i64 %len, 8
				; OPT-NEXT: [[ResBytes:%[0-9]+]] = urem i64 %len, 8
				; OPT-NEXT: [[BytesCopied:%[0-9]+]] = sub i64 %len, [[ResBytes]]
				; OPT-NEXT: [[Cmp:%[0-9]+]] = icmp ne i64 [[LoopCount]], 0
				; OPT-NEXT: br i1 [[Cmp]], label %loop-memcpy-expansion, label %loop-memcpy-residual-header

				; Exit block: the original return of %dst.
				; OPT: post-loop-memcpy-expansion:
				; OPT-NEXT: ret i8* %dst

				; Main loop: one i64 load/store per iteration, indexed by %loop-index, which
				; counts up from 0 until it reaches the element count.
				; OPT: loop-memcpy-expansion:
				; OPT-NEXT: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
				; OPT-NEXT: [[SrcGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[SrcCast]], i64 %loop-index
				; OPT-NEXT: [[Load:%[0-9]+]] = load i64, i64* [[SrcGep]]
				; OPT-NEXT: [[DstGep:%[0-9]+]] = getelementptr inbounds i64, i64* [[DstCast]], i64 %loop-index
				; OPT-NEXT: store i64 [[Load]], i64* [[DstGep]]
				; OPT-NEXT: [[IndexInc]] = add i64 %loop-index, 1
				; OPT-NEXT: [[LoopCmp:%[0-9]+]] = icmp ult i64 [[IndexInc]], [[LoopCount]]
				; OPT-NEXT: br i1 [[LoopCmp]], label %loop-memcpy-expansion, label %loop-memcpy-residual-header

				; Residual loop: one i8 load/store per iteration. The byte offset is the
				; number of bytes covered by the main loop plus %residual-loop-index, and the
				; loop runs until the leftover byte count is reached.
				; OPT: loop-memcpy-residual:
				; OPT-NEXT: %residual-loop-index = phi i64 [ 0, %loop-memcpy-residual-header ], [ [[ResIndexInc:%[0-9]+]], %loop-memcpy-residual ]
				; OPT-NEXT: [[SrcAsi8:%[0-9]+]] = bitcast i64* [[SrcCast]] to i8*
				; OPT-NEXT: [[DstAsi8:%[0-9]+]] = bitcast i64* [[DstCast]] to i8*
				; OPT-NEXT: [[ResIndex:%[0-9]+]] = add i64 [[BytesCopied]], %residual-loop-index
				; OPT-NEXT: [[SrcGep2:%[0-9]+]] = getelementptr inbounds i8, i8* [[SrcAsi8]], i64 [[ResIndex]]
				; OPT-NEXT: [[Load2:%[0-9]+]] = load i8, i8* [[SrcGep2]]
				; OPT-NEXT: [[DstGep2:%[0-9]+]] = getelementptr inbounds i8, i8* [[DstAsi8]], i64 [[ResIndex]]
				; OPT-NEXT: store i8 [[Load2]], i8* [[DstGep2]]
				; OPT-NEXT: [[ResIndexInc]] = add i64 %residual-loop-index, 1
				; OPT-NEXT: [[RCmp:%[0-9]+]] = icmp ult i64 [[ResIndexInc]], [[ResBytes]]
				; OPT-NEXT: br i1 [[RCmp]], label %loop-memcpy-residual, label %post-loop-memcpy-expansion

				; Residual header: skips the residual loop when there are no leftover bytes.
				; OPT: loop-memcpy-residual-header:
				; OPT-NEXT: [[RHCmp:%[0-9]+]] = icmp ne i64 [[ResBytes]], 0
				; OPT-NEXT: br i1 [[RHCmp]], label %loop-memcpy-residual, label %post-loop-memcpy-expansion
				}

				; Ensure the pass doesn't expand memcpy calls when compiling a function with an
				; unsupported "target-cpu" attribute.
				define i8* @memcpy_power7(i8* %dst, i8* %src, i64 %len) #1 {
				entry:
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
				ret i8* %dst
				; Attribute group #1 (declared with the other attribute groups below) sets
				; "target-cpu"="pwr7". The checks require the llvm.memcpy call to still be
				; present in the output, i.e. not replaced by a load/store loop.
				; PWR7-LABEL: @memcpy_power7
				; PWR7: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
				}

				; Ensure the pass doesn't expand calls in a function compiled for size.
				nemanjai (Unsubmitted, Not Done): I think this should be ALL, shouldn't it? You have the CPU=pwr7 attribute on the function, so regardless of the invocation, we shouldn't optimize. Same goes for `memcpy_opt_small`.
				define i8* @memcpy_opt_small(i8* %dst, i8* %src, i64 %len) #2 {
				entry:
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
				ret i8* %dst
				; Attribute group #2 (declared with the other attribute groups below) sets
				; "target-cpu"="pwr8" together with optsize. The checks require the
				; llvm.memcpy call to still be present in the output, unexpanded.
				; OPT-LABEL: @memcpy_opt_small
				; OPT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
				}

				; Ensure the pass doesn't expand calls on functions not compiled with
				; optimizations.
				define i8* @memcpy_opt_none(i8* %dst, i8* %src, i64 %len) {
				entry:
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %len, i32 1, i1 false)
				ret i8* %dst
				; No attribute group is attached to this function. The check expects a
				; `bl memcpy` in the output, i.e. the copy is still a call to memcpy.
				; NOTE(review): presumably this prefix belongs to an unoptimized llc
				; invocation that emits assembly; the RUN lines are outside this view, confirm.
				; OPTNONE-LABEL: @memcpy_opt_none
				; OPTNONE: bl memcpy
				}

				attributes #0 = { argmemonly nounwind }
				attributes #1 = { "target-cpu"="pwr7" }
				attributes #2 = { "target-cpu"="pwr8" optsize }
				hfinkel (Unsubmitted, Done): The code checks for pwr8, pwr9, etc. You should use that form here, I imagine, for the target cpu (I believe the short forms are indeed what the backend recognizes).

This is an archive of the discontinued LLVM Phabricator instance.

[PowerPC] Add pass to expand extra memcpy calls
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 126890

lib/Target/PowerPC/CMakeLists.txt

lib/Target/PowerPC/PPC.h

lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp

lib/Target/PowerPC/PPCTargetMachine.cpp

lib/Target/PowerPC/PPCTargetTransformInfo.h

lib/Target/PowerPC/PPCTargetTransformInfo.cpp

test/CodeGen/PowerPC/memcpy-loop-expansion.ll

This is an archive of the discontinued LLVM Phabricator instance.

[PowerPC] Add pass to expand extra memcpy calls
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 126890

lib/Target/PowerPC/CMakeLists.txt

lib/Target/PowerPC/PPC.h

lib/Target/PowerPC/PPCLowerMemIntrinsics.cpp

lib/Target/PowerPC/PPCTargetMachine.cpp

lib/Target/PowerPC/PPCTargetTransformInfo.h

lib/Target/PowerPC/PPCTargetTransformInfo.cpp

test/CodeGen/PowerPC/memcpy-loop-expansion.ll

[PowerPC] Add pass to expand extra memcpy calls
AbandonedPublic