Index: include/llvm/CodeGen/MachineFrameInfo.h
===================================================================
--- include/llvm/CodeGen/MachineFrameInfo.h
+++ include/llvm/CodeGen/MachineFrameInfo.h
@@ -570,6 +570,13 @@
     return Objects[ObjectIdx+NumFixedObjects].isImmutable;
   }
 
+  /// setIsImmutableObjectIndex - Marks the immutability of an object.
+  void setIsImmutableObjectIndex(int ObjectIdx, bool Immutable) {
+    assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
+           "Invalid Object Idx!");
+    Objects[ObjectIdx+NumFixedObjects].isImmutable = Immutable;
+  }
+
   /// Returns true if the specified index corresponds to a spill slot.
   bool isSpillSlotObjectIndex(int ObjectIdx) const {
     assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
Index: include/llvm/CodeGen/Passes.h
===================================================================
--- include/llvm/CodeGen/Passes.h
+++ include/llvm/CodeGen/Passes.h
@@ -318,6 +318,10 @@
   /// ExpandISelPseudos - This pass expands pseudo-instructions.
   extern char &ExpandISelPseudosID;
 
+  /// ArgumentCopyElision - This pass elides copies from argument stack memory
+  /// to local allocas.
+  FunctionPass *createArgumentCopyElisionPass();
+
   /// createExecutionDependencyFixPass - This pass fixes execution time
   /// problems with dependent instructions, such as switching execution
   /// domains to match.
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -68,6 +68,7 @@
 void initializeAlignmentFromAssumptionsPass(PassRegistry&);
 void initializeAlwaysInlinerLegacyPassPass(PassRegistry&);
 void initializeArgPromotionPass(PassRegistry&);
+void initializeArgumentCopyElisionPass(PassRegistry&);
 void initializeAssumptionCacheTrackerPass(PassRegistry &);
 void initializeAtomicExpandPass(PassRegistry&);
 void initializeBBVectorizePass(PassRegistry&);
Index: lib/CodeGen/ArgumentCopyElision.cpp
===================================================================
--- /dev/null
+++ lib/CodeGen/ArgumentCopyElision.cpp
@@ -0,0 +1,238 @@
+//===-- llvm/CodeGen/ArgumentCopyElision.cpp --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Eliminate copies from argument stack memory to local variable stack slots.
+// This can run at -O0 to simplify the generated machine code.
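+//
+// For example (an illustrative sketch; the exact instructions depend on the
+// target and calling convention), -O0 IR of the form
+//
+//   define void @f(i32 %x) {
+//     %x.addr = alloca i32
+//     store i32 %x, i32* %x.addr
+//     call void @capture(i32* %x.addr)
+//     ret void
+//   }
+//
+// is lowered, for targets that pass %x in memory, to a load from the fixed
+// argument stack object followed by a store to the local %x.addr object. This
+// pass rewrites references to the local object to use the argument object
+// directly and deletes the now-dead load/store pair.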
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <climits>
+using namespace llvm;
+
+#define DEBUG_TYPE "arg-copy-elision"
+
+static cl::opt<bool>
+    EnableArgCopyElim("enable-arg-copy-elim", cl::Hidden, cl::init(false),
+                      cl::desc("Enable stack argument copy elimination"));
+
+namespace {
+class ArgumentCopyElision : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  MachineFrameInfo *MFI;
+  const MachineRegisterInfo *MRI;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  ArgumentCopyElision() : MachineFunctionPass(ID) {}
+
+private:
+  bool isStoreOfArgumentLoad(MachineInstr &MI, unsigned &SrcReg, int &ArgIndex,
+                             int &CopyIndex);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+char ArgumentCopyElision::ID = 0;
+INITIALIZE_PASS(ArgumentCopyElision, "arg-copy-elision",
+                "Elide copies of arguments passed on the stack", false, false)
+
+bool ArgumentCopyElision::isStoreOfArgumentLoad(MachineInstr &MI,
+                                                unsigned &SrcReg, int &ArgIndex,
+                                                int &CopyIndex) {
+  SrcReg = TII->isStoreToStackSlot(MI, CopyIndex);
+  if (SrcReg == 0)
+    return false;
+
+  // Check that this store initializes the entire stack object.
+  int64_t StoreSize = (*MI.memoperands_begin())->getSize();
+  if (StoreSize != MFI->getObjectSize(CopyIndex))
+    return false;
+
+  // Don't elide copies from one fixed stack slot to another.
+  if (MFI->isFixedObjectIndex(CopyIndex))
+    return false;
+
+  // Check if the source register is an argument load.
+  assert(MRI->hasOneDef(SrcReg) && "should be in SSA");
+  MachineInstr &SrcLoad = *MRI->def_instr_begin(SrcReg);
+
+  // FIXME: Ask the target if this is a register with a home location, like on
+  // Win64.
+
+  // Check if this is a load from a fixed stack slot.
+  if (!TII->isLoadFromStackSlot(SrcLoad, ArgIndex) ||
+      !MFI->isFixedObjectIndex(ArgIndex))
+    return false;
+
+  // Look for immutable stack objects, which indicate a non-byval argument
+  // slot.
+  if (!MFI->isImmutableObjectIndex(ArgIndex)) {
+    DEBUG(dbgs() << "  rejecting elision candidate due to mutable argument "
+                 << MachineOperand::CreateFI(ArgIndex) << '\n');
+    return false;
+  }
+
+  return true;
+}
+
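+// Illustrative walk-through (schematic machine code; the real opcodes and
+// operands are target specific): for an entry block such as
+//
+//   %vreg0 = LOAD <fi#-1>          ; load the incoming argument
+//   STORE %vreg0, <fi#0>           ; copy it into the local object
+//   %vreg1 = FRAME_ADDR <fi#0>     ; take the address of the local object
+//   CALL @capture, %vreg1
+//
+// the local object fi#0 starts out Unknown and becomes an elision candidate
+// at the STORE, so later uses of fi#0 are simply rewritten to fi#-1. If
+// instead fi#0 had been captured first and some unrelated storing instruction
+// had appeared before the initializing copy, fi#0 would have been moved to
+// the Clobbered state and the copy kept.
+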
+/// State of a stack object during dataflow over the entry block. Each stack
+/// object starts out unknown. If we see a use of the frame index in a store
+/// that fully initializes the stack object, transition to the argument state.
+/// Any other use of the frame index means we stop matching that stack object.
+struct ElisionCandidate {
+  enum State {
+    Unknown = INT_MAX,
+    Clobbered = INT_MAX - 1,
+  };
+
+  ElisionCandidate() : ArgIndexOrState(Unknown) {}
+  ElisionCandidate(int ArgIndex) : ArgIndexOrState(ArgIndex) {}
+
+  static ElisionCandidate clobbered() { return ElisionCandidate(Clobbered); }
+
+  /// Either a fixed stack object index, or one of two sentinel values
+  /// indicating that this stack object was not initialized from an argument
+  /// in memory.
+  int ArgIndexOrState = Unknown;
+
+  bool isUnknown() const { return ArgIndexOrState == Unknown; }
+  bool isClobbered() const { return ArgIndexOrState == Clobbered; }
+  bool isArgument() const { return ArgIndexOrState < Clobbered; }
+};
+
+bool ArgumentCopyElision::runOnMachineFunction(MachineFunction &MF) {
+  if (!EnableArgCopyElim)
+    return false;
+
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  TII = ST.getInstrInfo();
+  MRI = &MF.getRegInfo();
+  MFI = &MF.getFrameInfo();
+
+  DEBUG(dbgs() << "\nBegin ArgumentCopyElision\n");
+
+  // Build a map from stack object index to clobber state with a vector. Don't
+  // include fixed stack objects, since we never elide from one to another.
+  SmallVector<ElisionCandidate, 16> ElisionCandidates;
+  ElisionCandidates.resize(MFI->getObjectIndexEnd());
+  SmallVector<int, 8> CapturesSinceLastStore;
+
+  // Iterate through the entry block, looking for copies to elide.
+  bool FoundElisions = false;
+  SmallVector<MachineInstr *, 8> DeadInstrs;
+  for (MachineInstr &MI : MF.front()) {
+    int ArgIndex, CopyIndex;
+    unsigned SrcReg;
+    if (isStoreOfArgumentLoad(MI, SrcReg, ArgIndex, CopyIndex)) {
+      if (ElisionCandidates[CopyIndex].isUnknown()) {
+        // Mark these frame indices for copy elision.
+        DEBUG(dbgs() << "  eliding argument copy from "
+                     << MachineOperand::CreateFI(ArgIndex) << " to "
+                     << MachineOperand::CreateFI(CopyIndex) << '\n');
+        ElisionCandidates[CopyIndex] = ElisionCandidate(ArgIndex);
+        assert(ElisionCandidates[CopyIndex].isArgument());
+        FoundElisions = true;
+
+        // Forget that the copied object was captured. This instruction stores
+        // to it.
+        CapturesSinceLastStore.erase(
+            std::remove(CapturesSinceLastStore.begin(),
+                        CapturesSinceLastStore.end(), CopyIndex),
+            CapturesSinceLastStore.end());
+
+        // Mark the copy destination as dead, as we are about to replace all
+        // uses of it.
+        MFI->RemoveStackObject(CopyIndex);
+
+        // Mark the argument object as mutable. By eliding the copy, we allow
+        // the user to modify the argument memory.
+        MFI->setIsImmutableObjectIndex(ArgIndex, false);
+
+        // Mark the store for deletion.
+        DeadInstrs.push_back(&MI);
+
+        // Mark the load for deletion if this was the only user.
+        if (MRI->hasOneUse(SrcReg))
+          DeadInstrs.push_back(&*MRI->def_instr_begin(SrcReg));
+      }
+    } else {
+      // Conservatively mark any frame index operands as captured if they
+      // haven't already been initialized with an argument.
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isFI())
+          continue;
+        int FI = MO.getIndex();
+        if (!MFI->isFixedObjectIndex(FI) && ElisionCandidates[FI].isUnknown())
+          CapturesSinceLastStore.push_back(FI);
+      }
+
+      // If this instruction stores, mark all captures as clobbered.
+      if (MI.mayStore() && !CapturesSinceLastStore.empty()) {
+        DEBUG(dbgs() << "  storing instruction clobbering all captured "
+                        "argument frame indices:\n"
+                     << "    " << MI);
+        DEBUG(dbgs() << "    frame indices:");
+        for (int FI : CapturesSinceLastStore) {
+          DEBUG(dbgs() << ' ' << MachineOperand::CreateFI(FI));
+          ElisionCandidates[FI] = ElisionCandidate::clobbered();
+        }
+        DEBUG(dbgs() << '\n');
+        CapturesSinceLastStore.clear();
+      }
+    }
+  }
+
+  if (!FoundElisions) {
+    DEBUG(dbgs() << "No argument copy elision candidates\n\n");
+    return false;
+  }
+
+  for (MachineInstr *MI : DeadInstrs) {
+    DEBUG(dbgs() << "  ArgumentCopyElision: DELETING: " << *MI);
+    MI->eraseFromParent();
+  }
+
+  // Replace all uses of CopyIndex with ArgIndex. We assume there are no other
+  // uses of ArgIndex, which should be the case, since all instruction
+  // selection schemes appear to load the argument into a virtual register and
+  // then split the live range during register allocation by rematerializing
+  // from the immutable stack slot. By changing the stack slot to mutable, we
+  // will force the register allocator to spill if it needs to.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (MachineOperand &MO : MI.operands()) {
+        // Find frame index operands that we want to copy elide and replace
+        // them.
+        if (!MO.isFI())
+          continue;
+        int FI = MO.getIndex();
+        if (MFI->isFixedObjectIndex(FI) || !ElisionCandidates[FI].isArgument())
+          continue;
+        DEBUG(dbgs() << "  replacing " << MO << " in " << MI);
+        MO.setIndex(ElisionCandidates[FI].ArgIndexOrState);
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "End ArgumentCopyElision\n");
+
+  return true;
+}
+
+FunctionPass *llvm::createArgumentCopyElisionPass() {
+  return new ArgumentCopyElision();
+}
Index: lib/CodeGen/CMakeLists.txt
===================================================================
--- lib/CodeGen/CMakeLists.txt
+++ lib/CodeGen/CMakeLists.txt
@@ -2,6 +2,7 @@
   AggressiveAntiDepBreaker.cpp
   AllocationOrder.cpp
   Analysis.cpp
+  ArgumentCopyElision.cpp
   AtomicExpandPass.cpp
   BasicTargetTransformInfo.cpp
   BranchFolding.cpp
Index: lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- lib/CodeGen/TargetPassConfig.cpp
+++ lib/CodeGen/TargetPassConfig.cpp
@@ -679,6 +679,10 @@
   // Pre-ra tail duplication.
   addPass(&EarlyTailDuplicateID);
 
+  // Eliminate copies of address-taken arguments now that we've lowered formal
+  // arguments.
+  addPass(createArgumentCopyElisionPass());
+
   // Optimize PHIs before DCE: removing dead PHI cycles may make more
   // instructions dead.
   addPass(&OptimizePHIsID, false);
Index: test/CodeGen/ARM/argument-copy-elision.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/argument-copy-elision.ll
@@ -0,0 +1,22 @@
+; RUN: llc -enable-arg-copy-elim -O2 < %s -mtriple=arm-linux-gnueabi | FileCheck %s
+
+; Check that argument copy elision works on ARM.
+
+declare i8* @llvm.returnaddress(i32)
+
+declare void @capture(i32*)
+
+define void @basic_addrtaken(i32, i32, i32, i32, i32 %x) {
+entry:
+  %x.addr = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  call void @capture(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: basic_addrtaken:
+; CHECK-NOT: ldr
+; CHECK-NOT: str
+; CHECK: add r0, sp, #8
+; CHECK: bl capture
+; CHECK: mov pc, lr
Index: test/CodeGen/X86/argument-copy-elision.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/argument-copy-elision.ll
@@ -0,0 +1,63 @@
+; RUN: llc -enable-arg-copy-elim -O2 < %s -mtriple=i686-windows | FileCheck %s
+
+declare i8* @llvm.returnaddress(i32)
+
+declare void @capture(i32*)
+
+define void @basic_addrtaken(i32 %x) {
+entry:
+  %x.addr = alloca i32, align 4
+  store i32 %x, i32* %x.addr, align 4
+  call void @capture(i32* %x.addr)
+  ret void
+}
+
+; The lea should be a *positive* offset from ESP, indicating that the address
+; of the original argument is passed to capture.
+
+; CHECK-LABEL: _basic_addrtaken:
+; CHECK-NOT: movl{{.*}}(%esp){{.*}}
+; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _capture
+; CHECK: retl
+
+; Make sure we don't elide copies from the return address.
+
+define void @retaddr_taken() {
+entry:
+  %x.addr = alloca i32, align 4
+  %ra = call i8* @llvm.returnaddress(i32 0)
+  %ra.i32 = ptrtoint i8* %ra to i32
+  store i32 %ra.i32, i32* %x.addr, align 4
+  call void @capture(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: _retaddr_taken:
+; CHECK: movl{{.*}}(%esp){{.*}}
+; CHECK: pushl %{{.*}}
+; CHECK: calll _capture
+; CHECK: retl
+
+; Make sure we look at the size of the stores.
+; If the stores were reversed we could try to be more clever here.
+
+define void @size_mismatch(i32 %x, i32 %y) {
+entry:
+  %aggr = alloca {i32, i32}, align 4
+  %x.addr = getelementptr {i32,i32}, {i32,i32}* %aggr, i32 0, i32 0
+  store i32 %x, i32* %x.addr, align 4
+  %y.addr = getelementptr {i32,i32}, {i32,i32}* %aggr, i32 0, i32 1
+  store i32 %y, i32* %y.addr, align 4
+  call void @capture(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: _size_mismatch:
+; CHECK: subl $8, %esp
+; CHECK: movl{{.*}}(%esp){{.*}}
+; CHECK: pushl %{{.*}}
+; CHECK: calll _capture
+; CHECK: retl