Index: include/llvm/CodeGen/TargetPassConfig.h
===================================================================
--- include/llvm/CodeGen/TargetPassConfig.h
+++ include/llvm/CodeGen/TargetPassConfig.h
@@ -355,6 +355,10 @@
   /// instructions in SSA form.
   virtual void addMachineSSAOptimization();
 
+  /// Add passes that lower variables to a particular memory type.
+  virtual void addMachineSSALowering() {}
+
   /// Add passes that optimize instruction level parallelism for out-of-order
   /// targets. These passes are run while the machine code is still in SSA
   /// form, so they can use MachineTraceMetrics to control their heuristics.
Index: lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- lib/CodeGen/TargetPassConfig.cpp
+++ lib/CodeGen/TargetPassConfig.cpp
@@ -809,6 +809,11 @@
   if (getOptLevel() != CodeGenOpt::None) {
     addMachineSSAOptimization();
   } else {
+    // Ensure lowering to the appropriate memory type occurs even when no
+    // optimizations are enabled. This type of lowering is required for
+    // correctness by the NVPTX backend.
+    addMachineSSALowering();
+
     // If the target requests it, assign local variables to stack slots relative
     // to one another and simplify frame index references where possible.
     addPass(&LocalStackSlotAllocationID, false);
Index: lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- lib/Target/NVPTX/CMakeLists.txt
+++ lib/Target/NVPTX/CMakeLists.txt
@@ -24,11 +24,13 @@
   NVPTXPeephole.cpp
   NVPTXMCExpr.cpp
   NVPTXPrologEpilogPass.cpp
+  NVPTXLowerSharedFrameIndicesPass.cpp
   NVPTXRegisterInfo.cpp
   NVPTXReplaceImageHandles.cpp
   NVPTXSubtarget.cpp
   NVPTXTargetMachine.cpp
   NVPTXTargetTransformInfo.cpp
+  NVPTXFunctionDataSharing.cpp
   NVPTXUtilities.cpp
   NVVMIntrRange.cpp
   NVVMReflect.cpp
Index: lib/Target/NVPTX/NVPTX.h
===================================================================
--- lib/Target/NVPTX/NVPTX.h
+++ lib/Target/NVPTX/NVPTX.h
@@ -48,10 +48,12 @@
 FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
 FunctionPass *createNVVMReflectPass();
 MachineFunctionPass *createNVPTXPrologEpilogPass();
+MachineFunctionPass *createNVPTXLowerSharedFrameIndicesPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
 FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
 BasicBlockPass *createNVPTXLowerAllocaPass();
+FunctionPass *createNVPTXFunctionDataSharingPass(const NVPTXTargetMachine *TM);
 MachineFunctionPass *createNVPTXPeephole();
 
 Target &getTheNVPTXTarget32();
Index: lib/Target/NVPTX/NVPTXAsmPrinter.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -92,6 +92,7 @@
 using namespace llvm;
 
 #define DEPOTNAME "__local_depot"
+#define SHARED_DEPOTNAME "__shared_depot"
 
 static cl::opt<bool>
 EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
@@ -1721,6 +1722,7 @@
   // virtual register number starting from 1 with that class.
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   //unsigned numRegClasses = TRI->getNumRegClasses();
+  bool IsKernelFunction = isKernelFunction(*MF.getFunction());
 
   // Emit the Fake Stack Object
   const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1728,12 +1730,24 @@
   if (NumBytes) {
     O << "\t.local .align " << MFI.getMaxAlignment() << " .b8 \t" << DEPOTNAME
       << getFunctionNumber() << "[" << NumBytes << "];\n";
+    if (IsKernelFunction) {
+      O << "\t.shared .align " << MFI.getMaxAlignment() << " .b8 \t"
+        << SHARED_DEPOTNAME << getFunctionNumber() << "[" << NumBytes << "];\n";
+    }
     if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
       O << "\t.reg .b64 \t%SP;\n";
       O << "\t.reg .b64 \t%SPL;\n";
+      if (IsKernelFunction) {
+        O << "\t.reg .b64 \t%SHSP;\n";
+        O << "\t.reg .b64 \t%SHSPL;\n";
+      }
     } else {
       O << "\t.reg .b32 \t%SP;\n";
       O << "\t.reg .b32 \t%SPL;\n";
+      if (IsKernelFunction) {
+        O << "\t.reg .b32 \t%SHSP;\n";
+        O << "\t.reg .b32 \t%SHSPL;\n";
+      }
     }
   }
 
@@ -2362,6 +2376,8 @@
     if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
       if (MO.getReg() == NVPTX::VRDepot)
         O << DEPOTNAME << getFunctionNumber();
+      else if (MO.getReg() == NVPTX::VRSharedDepot)
+        O << SHARED_DEPOTNAME << getFunctionNumber();
       else
         O << NVPTXInstPrinter::getRegisterName(MO.getReg());
     } else {
Index: lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -37,6 +37,8 @@
   /// \brief Clean up the name to remove symbols invalid in PTX.
   std::string cleanUpName(StringRef Name);
 
+  /// Set a clean name, ensuring collisions are avoided.
+  void generateCleanName(Value &V);
 };
 }
 
@@ -50,20 +52,31 @@
                 "Assign valid PTX names to globals", false, false)
 
 bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) {
-  for (GlobalVariable &GV : M.globals()) {
-    // We are only allowed to rename local symbols.
-    if (GV.hasLocalLinkage()) {
-      // setName doesn't do extra work if the name does not change.
-      // Note: this does not create collisions - if setName is asked to set the
-      // name to something that already exists, it adds a proper postfix to
-      // avoid collisions.
-      GV.setName(cleanUpName(GV.getName()));
-    }
-  }
+  // We are only allowed to rename local symbols.
+  for (GlobalVariable &GV : M.globals())
+    if (GV.hasLocalLinkage())
+      generateCleanName(GV);
+
+  // Clean function symbols.
+  for (auto &FN : M.functions())
+    if (FN.hasLocalLinkage())
+      generateCleanName(FN);
 
   return true;
 }
 
+void NVPTXAssignValidGlobalNames::generateCleanName(Value &V) {
+  std::string ValidName;
+  do {
+    ValidName = cleanUpName(V.getName());
+    // setName doesn't do extra work if the name does not change. Collisions
+    // are avoided by appending a suffix, which may itself still be unclean
+    // in PTX.
+    V.setName(ValidName);
+    // If there was no collision we are done; otherwise clean up the suffixed
+    // name on the next iteration.
+  } while (!V.getName().equals(ValidName));
+}
+
 std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) {
   std::string ValidName;
   raw_string_ostream ValidNameStream(ValidName);
Index: lib/Target/NVPTX/NVPTXFrameLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -16,6 +16,7 @@
 #include "NVPTXRegisterInfo.h"
 #include "NVPTXSubtarget.h"
 #include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -61,6 +62,27 @@
     BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
             NVPTX::VRFrameLocal)
         .addImm(MF.getFunctionNumber());
+
+    // Only emit a shared depot for the main kernel function. Other device
+    // functions need to get a handle on this shared depot by interacting
+    // with the runtime.
+    if (isKernelFunction(*MF.getFunction())) {
+      // Emits:
+      //   mov %SHSPL, __shared_depot<N>;
+      //   cvta.shared %SHSP, %SHSPL;
+      // For the time being this is emitted even if it ends up unused.
+      unsigned CvtaSharedOpcode =
+          Is64Bit ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes;
+      unsigned MovSharedDepotOpcode =
+          Is64Bit ? NVPTX::MOV_SHARED_DEPOT_ADDR_64 : NVPTX::MOV_SHARED_DEPOT_ADDR;
+      MI = BuildMI(MBB, MI, dl,
+                   MF.getSubtarget().getInstrInfo()->get(CvtaSharedOpcode),
+                   NVPTX::VRShared)
+               .addReg(NVPTX::VRFrameShared);
+      BuildMI(MBB, MI, dl,
+              MF.getSubtarget().getInstrInfo()->get(MovSharedDepotOpcode),
+              NVPTX::VRFrameShared)
+          .addImm(MF.getFunctionNumber());
+    }
   }
 }
Index: lib/Target/NVPTX/NVPTXFunctionDataSharing.h
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXFunctionDataSharing.h
@@ -0,0 +1,37 @@
+//===--- NVPTXFunctionDataSharing.h - Function data sharing ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the NVPTX function-level data sharing pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFUNCTIONDATASHARING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXFUNCTIONDATASHARING_H
+
+#include "NVPTXTargetMachine.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class NVPTXFunctionDataSharing : public FunctionPass {
+  bool runOnFunction(Function &F) override;
+
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXFunctionDataSharing(const NVPTXTargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {}
+
+  StringRef getPassName() const override {
+    return "Function level data sharing pass.";
+  }
+
+private:
+  const NVPTXTargetMachine *TM;
+};
+
+} // End llvm namespace
+
+#endif
\ No newline at end of file
Index: lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
@@ -0,0 +1,107 @@
+//===-- NVPTXFunctionDataSharing.cpp - Mark pointers as shared -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Marks alloca'ed values whose address escapes via a store as residing in
+// shared memory.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXFunctionDataSharingPass(PassRegistry &);
+}
+
+namespace {
+class NVPTXFunctionDataSharing : public FunctionPass {
+  bool runOnFunction(Function &F) override;
+
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXFunctionDataSharing(const NVPTXTargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {}
+
+  StringRef getPassName() const override {
+    return "Function level data sharing pass.";
+  }
+
+private:
+  const NVPTXTargetMachine *TM;
+};
+} // namespace
+
+char NVPTXFunctionDataSharing::ID = 1;
+
+INITIALIZE_PASS(NVPTXFunctionDataSharing, "nvptx-function-data-sharing",
                "Function Data Sharing (NVPTX)", false, false)
+
+static void markPointerAsShared(Value *Ptr) {
+  if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_SHARED)
+    return;
+
+  // Decide where to emit the addrspacecast pair: insert right after Ptr if
+  // Ptr is an instruction.
+  BasicBlock::iterator InsertPt =
+      std::next(cast<Instruction>(Ptr)->getIterator());
+  assert(InsertPt != InsertPt->getParent()->end() &&
+         "We don't call this function with Ptr being a terminator.");
+
+  auto *PtrInShared = new AddrSpaceCastInst(
+      Ptr, PointerType::get(Ptr->getType()->getPointerElementType(),
+                            ADDRESS_SPACE_SHARED),
+      Ptr->getName(), &*InsertPt);
+  auto *PtrInGeneric = new AddrSpaceCastInst(PtrInShared, Ptr->getType(),
+                                             Ptr->getName(), &*InsertPt);
+  // Replace all uses of Ptr with PtrInGeneric, except for PtrInShared itself.
+  Ptr->replaceAllUsesWith(PtrInGeneric);
+  PtrInShared->setOperand(0, Ptr);
+}
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXFunctionDataSharing::runOnKernelFunction(Function &F) {
+  if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
+    for (auto &B : F) {
+      for (auto &I : B) {
+        auto *AI = dyn_cast<AllocaInst>(&I);
+        if (!AI)
+          continue;
+        if (AI->getType()->isPointerTy() && ptrIsStored(AI))
+          markPointerAsShared(AI);
+      }
+    }
+  }
+
+  return true;
+}
+
+// Device functions currently require no data-sharing lowering.
+bool NVPTXFunctionDataSharing::runOnDeviceFunction(Function &F) {
+  return true;
+}
+
+bool NVPTXFunctionDataSharing::runOnFunction(Function &F) {
+  return isKernelFunction(F) ? runOnKernelFunction(F)
+                             : runOnDeviceFunction(F);
+}
+
+FunctionPass *
+llvm::createNVPTXFunctionDataSharingPass(const NVPTXTargetMachine *TM) {
+  return new NVPTXFunctionDataSharing(TM);
+}
Index: lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXInstrInfo.td
+++ lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1583,6 +1583,10 @@
                                "mov.u32 \t$d, __local_depot$num;", []>;
   def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
                                "mov.u64 \t$d, __local_depot$num;", []>;
+  def MOV_SHARED_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+                               "mov.u32 \t$d, __shared_depot$num;", []>;
+  def MOV_SHARED_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+                               "mov.u64 \t$d, __shared_depot$num;", []>;
 }
Index: lib/Target/NVPTX/NVPTXLowerAlloca.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -71,13 +71,34 @@
         Changed = true;
         auto PTy = dyn_cast<PointerType>(allocaInst->getType());
         auto ETy = PTy->getElementType();
-        auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
-        auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
-        auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
+
+        // In the CUDA case this is always a local address. When offloading to
+        // a device with OpenMP it may instead be an address allocated in the
+        // shared memory of the device.
+        auto *AddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
+        bool PtrIsStored = ptrIsStored(allocaInst);
+
+        // Handle shared arguments: shared arguments are currently declared as
+        // an alloca during LLVM IR code generation and are lowered to shared
+        // memory here.
+        if (PtrIsStored)
+          AddrTy = PointerType::get(ETy, ADDRESS_SPACE_SHARED);
+
+        auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, AddrTy, "");
+        auto *GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
         auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal,
                                                      GenericAddrTy, "");
         NewASCToLocal->insertAfter(allocaInst);
         NewASCToGeneric->insertAfter(NewASCToLocal);
+
+        // If the value is shared, these additional conversions are required
+        // for correctness.
+        if (PtrIsStored) {
+          allocaInst->replaceAllUsesWith(NewASCToGeneric);
+          NewASCToLocal->setOperand(0, allocaInst);
+          continue;
+        }
+
         for (Value::use_iterator UI = allocaInst->use_begin(),
                                  UE = allocaInst->use_end();
              UI != UE; ) {
@@ -93,9 +114,15 @@
             continue;
           }
           auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
-          if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) {
-            SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
-            continue;
+          if (SI && !SI->isVolatile()) {
+            unsigned Idx;
+            if (SI->getPointerOperand() == allocaInst)
+              Idx = SI->getPointerOperandIndex();
+            else if (SI->getValueOperand() == allocaInst)
+              Idx = 0;
+            else
+              continue;
+            SI->setOperand(Idx, NewASCToGeneric);
           }
           auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
           if (GI && GI->getPointerOperand() == allocaInst) {
Index: lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
@@ -0,0 +1,285 @@
+//===-- NVPTXLowerSharedFrameIndicesPass.cpp - NVPTX lowering ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified
+// to remove unneeded functionality and to handle virtual registers. It lowers
+// frame indices that refer to shared data to the shared frame index wherever
+// needed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-lower-shared-frame-indices"
+
+namespace {
+class NVPTXLowerSharedFrameIndicesPass : public MachineFunctionPass {
+public:
+  static char ID;
+  NVPTXLowerSharedFrameIndicesPass() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  void calculateSharedFrameObjectOffsets(MachineFunction &Fn);
+};
+}
+
+MachineFunctionPass *llvm::createNVPTXLowerSharedFrameIndicesPass() {
+  return new NVPTXLowerSharedFrameIndicesPass();
+}
+
+char NVPTXLowerSharedFrameIndicesPass::ID = 0;
+
+static bool isSharedFrame(MachineBasicBlock::iterator II,
+                          MachineFunction &MF) {
+  MachineInstr &CurrentMI = *II;
+
+  if (!CurrentMI.getOperand(0).isReg())
+    return false;
+
+  bool UseSharedFrame = false;
+  unsigned AllocRegisterNumber = CurrentMI.getOperand(0).getReg();
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == NVPTX::cvta_to_shared_yes_64 ||
+          MI.getOpcode() == NVPTX::cvta_to_shared_yes) {
+        if (AllocRegisterNumber == MI.getOperand(1).getReg()) {
+          UseSharedFrame = true;
+          break;
+        }
+      }
+    }
+  }
+  return UseSharedFrame;
+}
+
+bool NVPTXLowerSharedFrameIndicesPass::runOnMachineFunction(
+    MachineFunction &MF) {
+  bool Modified = false;
+  bool IsKernel = isKernelFunction(*MF.getFunction());
+
+  SmallVector<int, 16> SharedFrameIndices;
+
+  calculateSharedFrameObjectOffsets(MF);
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+        if (!MI.getOperand(i).isFI())
+          continue;
+
+        if (i + 1 >= MI.getNumOperands())
+          continue;
+
+        if (IsKernel) {
+          bool IsSharedFrame = false;
+          int FrameIndex = MI.getOperand(i).getIndex();
+
+          for (int SFI : SharedFrameIndices)
+            if (FrameIndex == SFI)
+              IsSharedFrame = true;
+
+          if (!IsSharedFrame && isSharedFrame(MI, MF)) {
+            SharedFrameIndices.push_back(FrameIndex);
+            IsSharedFrame = true;
+          }
+
+          if (IsSharedFrame) {
+            // Change the frame index to use the shared stack.
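+            // Fold the object's offset within the shared frame and the
+            // instruction's original immediate displacement into a single
+            // immediate operand.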
+            int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+                         MI.getOperand(i + 1).getImm();
+
+            // For shared data use the shared stack pointer virtual register
+            // VRShared as the frame register.
+            MI.getOperand(i).ChangeToRegister(NVPTX::VRShared, false);
+            MI.getOperand(i + 1).ChangeToImmediate(Offset);
+          }
+        }
+        Modified = true;
+      }
+    }
+  }
+
+  return Modified;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+                                     bool StackGrowsDown, int64_t &Offset,
+                                     unsigned &MaxAlign) {
+  // If the stack grows down, add the object size to find the lowest address.
+  if (StackGrowsDown)
+    Offset += MFI.getObjectSize(FrameIdx);
+
+  unsigned Align = MFI.getObjectAlignment(FrameIdx);
+
+  // If the alignment of this object is greater than that of the stack, then
+  // increase the stack alignment to match.
+  MaxAlign = std::max(MaxAlign, Align);
+
+  // Adjust to alignment boundary.
+  Offset = (Offset + Align - 1) / Align * Align;
+
+  if (StackGrowsDown) {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+    MFI.setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+  } else {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+    MFI.setObjectOffset(FrameIdx, Offset);
+    Offset += MFI.getObjectSize(FrameIdx);
+  }
+}
+
+/// This function computes the offsets inside the shared stack frame.
+///
+/// TODO: For simplicity the offsets currently coincide with the local stack
+/// frame offsets, i.e. the shared and local frames are laid out identically.
+void NVPTXLowerSharedFrameIndicesPass::calculateSharedFrameObjectOffsets(
+    MachineFunction &Fn) {
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
+
+  bool StackGrowsDown =
+      TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  // Loop over all of the stack objects, assigning sequential addresses...
+  MachineFrameInfo &MFI = Fn.getFrameInfo();
+
+  // Start at the beginning of the local area.
+  // The Offset is the distance from the stack top in the direction
+  // of stack growth -- so it's always nonnegative.
+  int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+  if (StackGrowsDown)
+    LocalAreaOffset = -LocalAreaOffset;
+  assert(LocalAreaOffset >= 0 &&
+         "Local area offset should be in direction of stack growth");
+  int64_t Offset = LocalAreaOffset;
+
+  // If there are fixed sized objects that are preallocated in the local area,
+  // non-fixed objects can't be allocated right at the start of local area.
+  // We currently don't support filling in holes in between fixed sized
+  // objects, so we adjust 'Offset' to point to the end of last fixed sized
+  // preallocated object.
+  for (int i = MFI.getObjectIndexBegin(); i != 0; ++i) {
+    int64_t FixedOff;
+    if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at lower address of
+      // the object -- which is given by offset. For down growing stack
+      // the offset is negative, so we negate the offset to get the distance.
+      FixedOff = -MFI.getObjectOffset(i);
+    } else {
+      // The maximum distance from the start pointer is at the upper
+      // address of the object.
+      FixedOff = MFI.getObjectOffset(i) + MFI.getObjectSize(i);
+    }
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+
+  // NOTE: We do not have a call stack.
+
+  unsigned MaxAlign = MFI.getMaxAlignment();
+
+  // No scavenger.
+
+  // FIXME: Once this is working, then enable flag will change to a target
+  // check for whether the frame is large enough to want to use virtual
+  // frame index registers. Functions which don't want/need this optimization
+  // will continue to use the existing code path.
+  if (MFI.getUseLocalStackAllocationBlock()) {
+    unsigned Align = MFI.getLocalFrameMaxAlign();
+
+    // Adjust to alignment boundary.
+    Offset = (Offset + Align - 1) / Align * Align;
+
+    DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+    // Resolve offsets for objects in the local block.
+    for (unsigned i = 0, e = MFI.getLocalFrameObjectCount(); i != e; ++i) {
+      std::pair<int, int64_t> Entry = MFI.getLocalFrameObjectMap(i);
+      int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+      DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << FIOffset
+                   << "]\n");
+      MFI.setObjectOffset(Entry.first, FIOffset);
+    }
+    // Allocate the local block.
+    Offset += MFI.getLocalFrameSize();
+
+    MaxAlign = std::max(Align, MaxAlign);
+  }
+
+  // No stack protector.
+
+  // Then assign frame offsets to stack objects that are not used to spill
+  // callee saved registers.
+  for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+    if (MFI.isObjectPreAllocated(i) && MFI.getUseLocalStackAllocationBlock())
+      continue;
+    if (MFI.isDeadObjectIndex(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // No scavenger.
+
+  if (!TFI.targetHandlesStackFrameRounding()) {
+    // If we have reserved argument space for call sites in the function
+    // immediately on entry to the current function, count it as part of the
+    // overall stack size.
+    if (MFI.adjustsStack() && TFI.hasReservedCallFrame(Fn))
+      Offset += MFI.getMaxCallFrameSize();
+
+    // Round up the size to a multiple of the alignment. If the function has
+    // any calls or alloca's, align to the target's StackAlignment value to
+    // ensure that the callee's frame or the alloca data is suitably aligned;
+    // otherwise, for leaf functions, align to the TransientStackAlignment
+    // value.
+    unsigned StackAlign;
+    if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
+        (RegInfo->needsStackRealignment(Fn) && MFI.getObjectIndexEnd() != 0))
+      StackAlign = TFI.getStackAlignment();
+    else
+      StackAlign = TFI.getTransientStackAlignment();
+
+    // If the frame pointer is eliminated, all frame offsets will be relative
+    // to SP not FP. Align to MaxAlign so this works.
+    StackAlign = std::max(StackAlign, MaxAlign);
+    unsigned AlignMask = StackAlign - 1;
+    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+  }
+
+  // Update frame info to pretend that this is part of the stack...
+  int64_t StackSize = Offset - LocalAreaOffset;
+  MFI.setStackSize(StackSize);
+}
Index: lib/Target/NVPTX/NVPTXRegisterInfo.h
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -45,6 +45,8 @@
 
   unsigned getFrameRegister(const MachineFunction &MF) const override;
 
+  unsigned getSharedFrameRegister(const MachineFunction &MF) const;
+
   ManagedStringPool *getStrPool() const {
    return const_cast<ManagedStringPool *>(&ManagedStrPool);
   }
Index: lib/Target/NVPTX/NVPTXRegisterInfo.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -130,3 +130,7 @@
 unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return NVPTX::VRFrame;
 }
+
+unsigned
+NVPTXRegisterInfo::getSharedFrameRegister(const MachineFunction &MF) const {
+  return NVPTX::VRShared;
+}
Index: lib/Target/NVPTX/NVPTXRegisterInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -25,9 +25,12 @@
 // Special Registers used as stack pointer
 def VRFrame       : NVPTXReg<"%SP">;
 def VRFrameLocal  : NVPTXReg<"%SPL">;
+def VRShared      : NVPTXReg<"%SHSP">;
+def VRFrameShared : NVPTXReg<"%SHSPL">;
 
 // Special Registers used as the stack
 def VRDepot       : NVPTXReg<"%Depot">;
+def VRSharedDepot : NVPTXReg<"%SharedDepot">;
 
 // We use virtual registers, but define a few physical registers here to keep
 // SDAG and the MachineInstr layers happy.
@@ -69,5 +72,5 @@
 def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
 
 // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot, VRShared, VRFrameShared, VRSharedDepot,
                                             (sequence "ENVREG%u", 0, 31))>;
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -54,6 +54,7 @@
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
 void initializeNVPTXLowerArgsPass(PassRegistry &);
 void initializeNVPTXLowerAllocaPass(PassRegistry &);
+void initializeNVPTXFunctionDataSharingPass(PassRegistry &);
 
 } // end namespace llvm
 
@@ -72,6 +73,7 @@
   initializeNVPTXAssignValidGlobalNamesPass(PR);
   initializeNVPTXLowerArgsPass(PR);
   initializeNVPTXLowerAllocaPass(PR);
+  initializeNVPTXFunctionDataSharingPass(PR);
   initializeNVPTXLowerAggrCopiesPass(PR);
 }
 
@@ -148,6 +150,7 @@
   bool addInstSelector() override;
   void addPostRegAlloc() override;
   void addMachineSSAOptimization() override;
+  void addMachineSSALowering() override;
 
   FunctionPass *createTargetRegisterAllocator(bool) override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
@@ -248,10 +251,15 @@
   // before the address space inference passes.
   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
   if (getOptLevel() != CodeGenOpt::None) {
+    // Add the address space inference passes.
    addAddressSpaceInferencePasses();
    if (!DisableLoadStoreVectorizer)
      addPass(createLoadStoreVectorizerPass());
    addStraightLineScalarOptimizationPasses();
+  } else {
+    // Even when no optimizations are used, we need to lower certain alloca
+    // instructions to the appropriate memory type for correctness.
+    addPass(createNVPTXFunctionDataSharingPass(&getNVPTXTargetMachine()));
   }
 
   // === LSR and other generic IR passes ===
@@ -329,6 +337,11 @@
   printAndVerify("After StackSlotColoring");
 }
 
+void NVPTXPassConfig::addMachineSSALowering() {
+  // Lower shared frame indices.
+  addPass(createNVPTXLowerSharedFrameIndicesPass(), false);
+}
+
 void NVPTXPassConfig::addMachineSSAOptimization() {
   // Pre-ra tail duplication.
   if (addPass(&EarlyTailDuplicateID))
@@ -338,6 +351,11 @@
   // instructions dead.
   addPass(&OptimizePHIsID);
 
+  // Lower shared frame indices before the machine SSA optimizations run, so
+  // that those optimizations cannot treat shared and local frame indices as
+  // if they were the same.
+  addMachineSSALowering();
+
   // This pass merges large allocas. StackSlotColoring is a different pass
   // which merges spill slots.
   addPass(&StackColoringID);
Index: lib/Target/NVPTX/NVPTXUtilities.h
===================================================================
--- lib/Target/NVPTX/NVPTXUtilities.h
+++ lib/Target/NVPTX/NVPTXUtilities.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -60,6 +62,8 @@
 bool getAlign(const Function &, unsigned index, unsigned &);
 bool getAlign(const CallInst &, unsigned index, unsigned &);
 
+bool ptrIsStored(Value *Ptr);
+
 }
 
 #endif
Index: lib/Target/NVPTX/NVPTXUtilities.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXUtilities.cpp
+++ lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -28,6 +28,8 @@
 
 namespace llvm {
 
+#define DEBUG_TYPE "nvptx-utilities"
+
 namespace {
 typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
 typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
@@ -314,4 +316,50 @@
   return false;
 }
 
+/// Returns true if there is any instruction that stores the address of this
+/// pointer.
+bool ptrIsStored(Value *Ptr) {
+  SmallVector<const Value *, 8> PointerAliases;
+  PointerAliases.push_back(Ptr);
+
+  SmallVector<const User *, 8> Users;
+  for (const Use &U : Ptr->uses())
+    Users.push_back(U.getUser());
+
+  for (unsigned I = 0; I < Users.size(); ++I) {
+    // Examine this use of the pointer.
+    const User *FU = Users[I];
+
+    // Check whether Ptr, or an alias of it, is the value being stored.
+    auto SI = dyn_cast<StoreInst>(FU);
+    if (SI) {
+      for (auto Alias : PointerAliases)
+        if (SI->getValueOperand() == Alias)
+          return true;
+      continue;
+    }
+
+    // TODO: Can loads lead to the address being taken?
+    // TODO: Can GEPs lead to the address being taken?
+
+    // Bitcasts add aliases of the pointer.
+    auto BI = dyn_cast<BitCastInst>(FU);
+    if (BI) {
+      for (const Use &U : BI->uses())
+        Users.push_back(U.getUser());
+      PointerAliases.push_back(BI);
+      continue;
+    }
+
+    // TODO: There may be other instructions which create further aliases of
+    // the pointer, e.g. operations on the address of the alloca. The whole
+    // alloca'ed memory region needs to be shared if at least one of the
+    // values needs to be shared.
+  }
+
+  // The address of the pointer is never stored.
+  return false;
+}
+
 } // namespace llvm
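
Note for reviewers: a minimal sketch of the intended IR rewrite, assuming typed
pointers and illustrative value names (the actual names are derived from the
alloca's name). For a kernel function, an alloca whose address escapes via a
store is routed through the shared address space (addrspace(3)) by
NVPTXFunctionDataSharing at -O0, or by the updated NVPTXLowerAlloca when
optimizations are enabled:

  ; before
  %p = alloca i32
  store i32* %p, i32** %out

  ; after (sketch)
  %p = alloca i32
  %p.shared = addrspacecast i32* %p to i32 addrspace(3)*
  %p.generic = addrspacecast i32 addrspace(3)* %p.shared to i32*
  store i32* %p.generic, i32** %out

The generic-to-shared cast is expected to select to cvta.to.shared, which is
the pattern NVPTXLowerSharedFrameIndicesPass looks for when rebasing a frame
index onto the shared depot (%SHSP / __shared_depot<N>) introduced by the
AsmPrinter and frame-lowering changes above.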