Index: include/llvm/CodeGen/TargetPassConfig.h
===================================================================
--- include/llvm/CodeGen/TargetPassConfig.h
+++ include/llvm/CodeGen/TargetPassConfig.h
@@ -355,6 +355,10 @@
   /// instructions in SSA form.
   virtual void addMachineSSAOptimization();
 
+  /// Add passes that lower variables to a
+  /// particular memory type.
+  virtual void addMachineSSALowering() {}
+
   /// Add passes that optimize instruction level parallelism for out-of-order
   /// targets. These passes are run while the machine code is still in SSA
   /// form, so they can use MachineTraceMetrics to control their heuristics.
Index: lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- lib/CodeGen/TargetPassConfig.cpp
+++ lib/CodeGen/TargetPassConfig.cpp
@@ -816,6 +816,11 @@
   if (getOptLevel() != CodeGenOpt::None) {
     addMachineSSAOptimization();
   } else {
+    // Ensure lowering to the appropriate memory type occurs even when no
+    // optimizations are enabled. This type of lowering is required for
+    // correctness by the NVPTX backend.
+    addMachineSSALowering();
+
     // If the target requests it, assign local variables to stack slots relative
     // to one another and simplify frame index references where possible.
     addPass(&LocalStackSlotAllocationID, false);
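Note: the new addMachineSSALowering() hook is a no-op unless a target overrides it. A minimal sketch of such an override (the class and pass-factory names below are illustrative only; the real NVPTX override appears further down in this patch):

    class MyTargetPassConfig : public TargetPassConfig {
    public:
      // Run the target's memory-type lowering while the code is still in
      // SSA form.
      void addMachineSSALowering() override {
        // The second argument disables the post-pass verifier, matching how
        // the NVPTX override below registers its pass.
        addPass(createMyTargetSSALoweringPass(), false);
      }
    };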
Index: lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- lib/Target/NVPTX/CMakeLists.txt
+++ lib/Target/NVPTX/CMakeLists.txt
@@ -24,11 +24,13 @@
   NVPTXPeephole.cpp
   NVPTXMCExpr.cpp
   NVPTXPrologEpilogPass.cpp
+  NVPTXLowerSharedFrameIndicesPass.cpp
   NVPTXRegisterInfo.cpp
   NVPTXReplaceImageHandles.cpp
   NVPTXSubtarget.cpp
   NVPTXTargetMachine.cpp
   NVPTXTargetTransformInfo.cpp
+  NVPTXFunctionDataSharing.cpp
   NVPTXUtilities.cpp
   NVVMIntrRange.cpp
   NVVMReflect.cpp
Index: lib/Target/NVPTX/NVPTX.h
===================================================================
--- lib/Target/NVPTX/NVPTX.h
+++ lib/Target/NVPTX/NVPTX.h
@@ -48,10 +48,12 @@
 FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
 FunctionPass *createNVVMReflectPass();
 MachineFunctionPass *createNVPTXPrologEpilogPass();
+MachineFunctionPass *createNVPTXLowerSharedFrameIndicesPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
 FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
 BasicBlockPass *createNVPTXLowerAllocaPass();
+FunctionPass *createNVPTXFunctionDataSharingPass(const NVPTXTargetMachine *TM);
 MachineFunctionPass *createNVPTXPeephole();
 
 Target &getTheNVPTXTarget32();
Index: lib/Target/NVPTX/NVPTXAsmPrinter.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -92,6 +92,7 @@
 using namespace llvm;
 
 #define DEPOTNAME "__local_depot"
+#define SHARED_DEPOTNAME "__shared_depot"
 
 static cl::opt<bool>
 EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
@@ -1721,6 +1722,10 @@
   // virtual register number starting from 1 with that class.
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   //unsigned numRegClasses = TRI->getNumRegClasses();
+  bool IsKernelFunction = isKernelFunction(MF.getFunction());
+
+  bool GenerateSharedDepot =
+      MF.getFunction().hasFnAttribute("has-nvptx-shared-depot");
 
   // Emit the Fake Stack Object
   const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1728,12 +1733,25 @@
   if (NumBytes) {
     O << "\t.local .align " << MFI.getMaxAlignment() << " .b8 \t" << DEPOTNAME
       << getFunctionNumber() << "[" << NumBytes << "];\n";
+    if (IsKernelFunction && GenerateSharedDepot) {
+      O << "\t.shared .align " << MFI.getMaxAlignment()
+        << " .b8 \t" << SHARED_DEPOTNAME << getFunctionNumber()
+        << "[" << NumBytes << "];\n";
+    }
     if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
       O << "\t.reg .b64 \t%SP;\n";
       O << "\t.reg .b64 \t%SPL;\n";
+      if (IsKernelFunction && GenerateSharedDepot) {
+        O << "\t.reg .b64 \t%SPS;\n";
+        O << "\t.reg .b64 \t%SPSH;\n";
+      }
     } else {
       O << "\t.reg .b32 \t%SP;\n";
       O << "\t.reg .b32 \t%SPL;\n";
+      if (IsKernelFunction && GenerateSharedDepot) {
+        O << "\t.reg .b32 \t%SPS;\n";
+        O << "\t.reg .b32 \t%SPSH;\n";
+      }
     }
   }
@@ -2358,6 +2376,8 @@
     if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
       if (MO.getReg() == NVPTX::VRDepot)
         O << DEPOTNAME << getFunctionNumber();
+      else if (MO.getReg() == NVPTX::VRSharedDepot)
+        O << SHARED_DEPOTNAME << getFunctionNumber();
       else
         O << NVPTXInstPrinter::getRegisterName(MO.getReg());
     } else {
Index: lib/Target/NVPTX/NVPTXFrameLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -16,6 +16,7 @@
 #include "NVPTXRegisterInfo.h"
 #include "NVPTXSubtarget.h"
 #include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -61,6 +62,37 @@
     BuildMI(MBB, MI, dl,
             MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
             NVPTX::VRFrameLocal)
         .addImm(MF.getFunctionNumber());
+
+    bool SharedStackPointerInit =
+        MF.getFunction().hasFnAttribute("has-nvptx-shared-depot");
+
+    // Only emit a shared depot for the main kernel function. The other
+    // device functions need to get a handle on this shared depot by
+    // interacting with a runtime library.
+    //
+    // Clang can trigger this at any point via the has-nvptx-shared-depot
+    // function attribute.
+    //
+    // Currently this situation arises as a consequence of OpenMP semantics.
+    // The interaction in that case involves the OpenMP runtime.
+    if (isKernelFunction(MF.getFunction()) && SharedStackPointerInit) {
+      // Emits:
+      //   mov %SPSH, __shared_depot<num>;
+      //   cvta.shared %SPS, %SPSH;
+      // For the time being, emit this even if it ends up unused.
+      unsigned CvtaSharedOpcode =
+          Is64Bit ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes;
+      unsigned MovSharedDepotOpcode =
+          Is64Bit ? NVPTX::MOV_SHARED_DEPOT_ADDR_64
+                  : NVPTX::MOV_SHARED_DEPOT_ADDR;
+      MI = BuildMI(MBB, MI, dl,
+                   MF.getSubtarget().getInstrInfo()->get(CvtaSharedOpcode),
+                   NVPTX::VRShared)
+               .addReg(NVPTX::VRFrameShared);
+      BuildMI(MBB, MI, dl,
+              MF.getSubtarget().getInstrInfo()->get(MovSharedDepotOpcode),
+              NVPTX::VRFrameShared)
+          .addImm(MF.getFunctionNumber());
+    }
   }
 }
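For reference, the prologue code above only fires for kernels that carry the attribute. A sketch of how a frontend would opt a kernel in, assuming only the standard llvm::Function API (the helper name is made up):

    #include "llvm/IR/Function.h"

    // Tag a kernel so the NVPTX backend emits and initializes the shared
    // depot for it. The backend only tests for the attribute's presence.
    void enableSharedDepot(llvm::Function &Kernel) {
      Kernel.addFnAttr("has-nvptx-shared-depot");
    }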
Index: lib/Target/NVPTX/NVPTXFunctionDataSharing.h
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXFunctionDataSharing.h
@@ -0,0 +1,43 @@
+//===-- NVPTXFunctionDataSharing.h - NVPTX Data Sharing --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the pass that marks stored alloca pointers in kernel functions
+// as shared so that they can be lowered to the shared memory depot.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFUNCTIONDATASHARING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXFUNCTIONDATASHARING_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class NVPTXTargetMachine;
+
+class NVPTXFunctionDataSharing : public FunctionPass {
+  bool runOnFunction(Function &F) override;
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXFunctionDataSharing(const NVPTXTargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {}
+  StringRef getPassName() const override {
+    return "Function level data sharing pass";
+  }
+
+private:
+  const NVPTXTargetMachine *TM;
+};
+} // End llvm namespace
+
+#endif
\ No newline at end of file
Index: lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
@@ -0,0 +1,127 @@
+//===-- NVPTXFunctionDataSharing.cpp - Mark pointers as shared -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// For each stored alloca instruction, add a pair of casts through the shared
+// address space. For example,
+//
+//   %A = alloca i32
+//   store i32 0, i32* %A ; emits st.u32
+//
+// will be transformed to
+//
+//   %A = alloca i32
+//   %Shared = addrspacecast i32* %A to i32 addrspace(3)*
+//   %Generic = addrspacecast i32 addrspace(3)* %Shared to i32*
+//   store i32 0, i32* %Generic
+//
+// We rely on NVPTXInferAddressSpaces to combine the last two instructions,
+// so that the store becomes st.shared.u32.
+//
+// This pass is invoked for -O0 only.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXFunctionDataSharingPass(PassRegistry &);
+}
+
+namespace {
+class NVPTXFunctionDataSharing : public FunctionPass {
+  bool runOnFunction(Function &F) override;
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXFunctionDataSharing(const NVPTXTargetMachine *TM = nullptr)
+      : FunctionPass(ID) {}
+  StringRef getPassName() const override {
+    return "Function level data sharing pass";
+  }
+};
+} // namespace
+
+char NVPTXFunctionDataSharing::ID = 1;
+
+INITIALIZE_PASS(NVPTXFunctionDataSharing, "nvptx-function-data-sharing",
+                "Function Data Sharing (NVPTX)", false, false)
+
+static void markPointerAsShared(Value *Ptr) {
+  if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_SHARED)
+    return;
+
+  // Decide where to emit the addrspacecast pair: insert right after Ptr,
+  // which must be an instruction.
+  BasicBlock::iterator InsertPt =
+      std::next(cast<Instruction>(Ptr)->getIterator());
+  assert(InsertPt != InsertPt->getParent()->end() &&
+         "We don't call this function with Ptr being a terminator.");
+
+  // Cast the pointer into the shared address space.
+  auto *PtrInShared = new AddrSpaceCastInst(
+      Ptr, PointerType::get(Ptr->getType()->getPointerElementType(),
+                            ADDRESS_SPACE_SHARED),
+      Ptr->getName(), &*InsertPt);
+  // Cast the shared pointer back to the generic address space.
+  auto *PtrInGeneric = new AddrSpaceCastInst(PtrInShared, Ptr->getType(),
+                                             Ptr->getName(), &*InsertPt);
+  // Replace with PtrInGeneric all uses of Ptr except PtrInShared.
+  Ptr->replaceAllUsesWith(PtrInGeneric);
+  PtrInShared->setOperand(0, Ptr);
+}
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXFunctionDataSharing::runOnKernelFunction(Function &F) {
+  bool Modified = false;
+
+  // Skip the pass if no data sharing is required.
+  if (!F.hasFnAttribute("has-nvptx-shared-depot"))
+    return false;
+
+  for (auto &B : F) {
+    for (auto &I : B) {
+      auto *AI = dyn_cast<AllocaInst>(&I);
+      if (!AI)
+        continue;
+      if (AI->getType()->isPointerTy() && ptrIsStored(AI)) {
+        markPointerAsShared(AI);
+        Modified = true;
+      }
+    }
+  }
+
+  return Modified;
+}
+
+// Device functions require no rewriting; sharing is set up by the kernel.
+bool NVPTXFunctionDataSharing::runOnDeviceFunction(Function &F) {
+  return false;
+}
+
+bool NVPTXFunctionDataSharing::runOnFunction(Function &F) {
+  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
+}
+
+FunctionPass *
+llvm::createNVPTXFunctionDataSharingPass(const NVPTXTargetMachine *TM) {
+  return new NVPTXFunctionDataSharing(TM);
+}
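The pass can be exercised in isolation through the legacy pass manager; a minimal sketch (assumes `M` is an already-parsed llvm::Module, `TM` an NVPTXTargetMachine, and that this code lives inside the NVPTX target where NVPTX.h is visible):

    #include "llvm/IR/LegacyPassManager.h"

    // Rewrites stored allocas in kernel functions into shared/generic
    // addrspacecast pairs, as described in the file comment above.
    void runDataSharing(llvm::Module &M, const llvm::NVPTXTargetMachine *TM) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createNVPTXFunctionDataSharingPass(TM));
      PM.run(M);
    }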
Index: lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXInstrInfo.td
+++ lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1583,6 +1583,10 @@
                                  "mov.u32 \t$d, __local_depot$num;", []>;
   def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
                                     "mov.u64 \t$d, __local_depot$num;", []>;
+  def MOV_SHARED_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+                                 "mov.u32 \t$d, __shared_depot$num;", []>;
+  def MOV_SHARED_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+                                    "mov.u64 \t$d, __shared_depot$num;", []>;
 }
Index: lib/Target/NVPTX/NVPTXLowerAlloca.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -23,6 +23,9 @@
 // And we will rely on NVPTXInferAddressSpaces to combine the last two
 // instructions.
 //
+// In the case of OpenMP shared variables, perform the same transformation as
+// for local variables, but using the shared address space.
+//
 //===----------------------------------------------------------------------===//
@@ -71,13 +74,36 @@
         Changed = true;
         auto PTy = dyn_cast<PointerType>(allocaInst->getType());
         auto ETy = PTy->getElementType();
-        auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
-        auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
-        auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
+
+        // In the CUDA case, this is always a local address. When offloading
+        // to a device with OpenMP, it may instead be an address in the
+        // device's shared memory.
+        auto *AddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
+        bool PtrIsStored = ptrIsStored(allocaInst);
+        bool RequiresSharedMemory =
+            BB.getParent()->hasFnAttribute("has-nvptx-shared-depot");
+
+        // Handle shared arguments: they are currently declared as allocas
+        // during LLVM IR code generation and must be lowered to shared
+        // memory.
+        if (PtrIsStored && RequiresSharedMemory)
+          AddrTy = PointerType::get(ETy, ADDRESS_SPACE_SHARED);
+
+        auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, AddrTy, "");
+        auto *GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
         auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal,
                                                      GenericAddrTy, "");
         NewASCToLocal->insertAfter(allocaInst);
         NewASCToGeneric->insertAfter(NewASCToLocal);
+
+        // If the value is shared, the additional conversions are required
+        // for correctness: rewrite all uses up front and skip the per-use
+        // handling below.
+        if (PtrIsStored && RequiresSharedMemory) {
+          allocaInst->replaceAllUsesWith(NewASCToGeneric);
+          NewASCToLocal->setOperand(0, allocaInst);
+          continue;
+        }
+
         for (Value::use_iterator UI = allocaInst->use_begin(),
                                  UE = allocaInst->use_end();
              UI != UE; ) {
@@ -93,9 +119,15 @@
             continue;
           }
           auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
-          if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) {
-            SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
-            continue;
+          if (SI && !SI->isVolatile()) {
+            unsigned Idx;
+            if (SI->getPointerOperand() == allocaInst)
+              Idx = SI->getPointerOperandIndex();
+            else if (SI->getValueOperand() == allocaInst)
+              Idx = 0;
+            else
+              continue;
+            SI->setOperand(Idx, NewASCToGeneric);
           }
           auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
           if (GI && GI->getPointerOperand() == allocaInst) {
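The extended store handling above matters because an alloca can sit on either side of a store: as the destination (its memory is written) or as the stored value (its address escapes). A condensed sketch of the distinction (the helper name is illustrative; StoreInst operand 0 is the value, operand 1 the address):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Returns the operand index at which AI appears in SI, or -1 if absent.
    static int allocaOperandIndex(StoreInst *SI, AllocaInst *AI) {
      if (SI->getPointerOperand() == AI)
        return SI->getPointerOperandIndex(); // e.g. store i32 0, i32* %A
      if (SI->getValueOperand() == AI)
        return 0;                            // e.g. store i32* %A, i32** %P
      return -1;
    }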
Index: lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
@@ -0,0 +1,296 @@
+//===-- NVPTXLowerSharedFrameIndicesPass.cpp - NVPTX lowering ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the frame indices that need to reference the shared memory
+// depot to the shared frame virtual register VRShared. The lowering is only
+// performed for functions that have the has-nvptx-shared-depot attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-lower-shared-frame-indices"
+
+namespace {
+class NVPTXLowerSharedFrameIndicesPass : public MachineFunctionPass {
+public:
+  static char ID;
+  NVPTXLowerSharedFrameIndicesPass() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  void calculateSharedFrameObjectOffsets(MachineFunction &Fn);
+};
+} // namespace
+
+MachineFunctionPass *llvm::createNVPTXLowerSharedFrameIndicesPass() {
+  return new NVPTXLowerSharedFrameIndicesPass();
+}
+
+char NVPTXLowerSharedFrameIndicesPass::ID = 0;
+
+// Returns true if the register defined by the given instruction is later
+// converted to the shared address space, i.e. the frame object it
+// addresses lives in the shared depot.
+static bool isSharedFrame(MachineInstr &CurMI, MachineFunction &MF) {
+  if (!CurMI.getOperand(0).isReg())
+    return false;
+
+  unsigned AllocRegisterNumber = CurMI.getOperand(0).getReg();
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == NVPTX::cvta_to_shared_yes_64 ||
+          MI.getOpcode() == NVPTX::cvta_to_shared_yes) {
+        if (AllocRegisterNumber == MI.getOperand(1).getReg())
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool NVPTXLowerSharedFrameIndicesPass::runOnMachineFunction(
+    MachineFunction &MF) {
+  bool Modified = false;
+
+  // Skip this pass if the function is not a kernel.
+  if (!isKernelFunction(MF.getFunction()))
+    return Modified;
+
+  // Skip this pass if no data sharing is required.
+  // TODO: At the moment there is no limit enforced on the amount
+  // of shared memory used for this scheme. A limit will be
+  // introduced once a fallback scheme that uses global memory is
+  // in place: when the size of the shared depot exceeds a certain
+  // threshold then the global memory scheme should be used instead.
+  if (!MF.getFunction().hasFnAttribute("has-nvptx-shared-depot"))
+    return Modified;
+
+  SmallVector<int, 16> SharedFrameIndices;
+
+  calculateSharedFrameObjectOffsets(MF);
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+        if (!MI.getOperand(i).isFI())
+          continue;
+
+        // A frame index operand is expected to be followed by its
+        // immediate offset operand.
+        if (i + 1 >= MI.getNumOperands())
+          continue;
+
+        bool IsSharedFrame = false;
+        int FrameIndex = MI.getOperand(i).getIndex();
+
+        for (int SFI : SharedFrameIndices)
+          if (FrameIndex == SFI)
+            IsSharedFrame = true;
+
+        if (!IsSharedFrame && isSharedFrame(MI, MF)) {
+          SharedFrameIndices.push_back(FrameIndex);
+          IsSharedFrame = true;
+        }
+
+        if (IsSharedFrame) {
+          // Change the frame index to use the shared stack: for shared
+          // data, use the appropriate virtual register, VRShared.
+          int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+                       MI.getOperand(i + 1).getImm();
+
+          MI.getOperand(i).ChangeToRegister(NVPTX::VRShared, false);
+          MI.getOperand(i + 1).ChangeToImmediate(Offset);
+          Modified = true;
+        }
+      }
+    }
+  }
+
+  return Modified;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+                  bool StackGrowsDown, int64_t &Offset,
+                  unsigned &MaxAlign) {
+  // If the stack grows down, add the object size to find the lowest address.
+  if (StackGrowsDown)
+    Offset += MFI.getObjectSize(FrameIdx);
+
+  unsigned Align = MFI.getObjectAlignment(FrameIdx);
+
+  // If the alignment of this object is greater than that of the stack, then
+  // increase the stack alignment to match.
+  MaxAlign = std::max(MaxAlign, Align);
+
+  // Adjust to alignment boundary.
+  Offset = (Offset + Align - 1) / Align * Align;
+
+  if (StackGrowsDown) {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+    MFI.setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+  } else {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+    MFI.setObjectOffset(FrameIdx, Offset);
+    Offset += MFI.getObjectSize(FrameIdx);
+  }
+}
+
+/// This function computes the offsets inside the shared stack.
+///
+/// TODO: For simplicity, the shared offsets currently coincide with
+/// the local stack frame offsets - the shared and local frame layouts
+/// are computed identically.
+void
+NVPTXLowerSharedFrameIndicesPass::calculateSharedFrameObjectOffsets(
+    MachineFunction &Fn) {
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
+
+  bool StackGrowsDown =
+      TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  // Loop over all of the stack objects, assigning sequential addresses...
+  MachineFrameInfo &MFI = Fn.getFrameInfo();
+
+  // Start at the beginning of the local area.
+  // The Offset is the distance from the stack top in the direction
+  // of stack growth -- so it's always nonnegative.
+  int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+  if (StackGrowsDown)
+    LocalAreaOffset = -LocalAreaOffset;
+  assert(LocalAreaOffset >= 0 &&
+         "Local area offset should be in direction of stack growth");
+  int64_t Offset = LocalAreaOffset;
+
+  // If there are fixed sized objects that are preallocated in the local area,
+  // non-fixed objects can't be allocated right at the start of local area.
+  // We currently don't support filling in holes in between fixed sized
+  // objects, so we adjust 'Offset' to point to the end of last fixed sized
+  // preallocated object.
+  for (int i = MFI.getObjectIndexBegin(); i != 0; ++i) {
+    int64_t FixedOff;
+    if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at lower address of
+      // the object -- which is given by offset. For down growing stack
+      // the offset is negative, so we negate the offset to get the distance.
+      FixedOff = -MFI.getObjectOffset(i);
+    } else {
+      // The maximum distance from the start pointer is at the upper
+      // address of the object.
+      FixedOff = MFI.getObjectOffset(i) + MFI.getObjectSize(i);
+    }
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+
+  // NOTE: We do not have a call stack.
+
+  unsigned MaxAlign = MFI.getMaxAlignment();
+
+  // No scavenger.
+
+  // FIXME: Once this is working, then enable flag will change to a target
+  // check for whether the frame is large enough to want to use virtual
+  // frame index registers. Functions which don't want/need this optimization
+  // will continue to use the existing code path.
+  if (MFI.getUseLocalStackAllocationBlock()) {
+    unsigned Align = MFI.getLocalFrameMaxAlign();
+
+    // Adjust to alignment boundary.
+    Offset = (Offset + Align - 1) / Align * Align;
+
+    DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+    // Resolve offsets for objects in the local block.
+    for (unsigned i = 0, e = MFI.getLocalFrameObjectCount(); i != e; ++i) {
+      std::pair<int, int64_t> Entry = MFI.getLocalFrameObjectMap(i);
+      int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+      DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+            FIOffset << "]\n");
+      MFI.setObjectOffset(Entry.first, FIOffset);
+    }
+    // Allocate the local block.
+    Offset += MFI.getLocalFrameSize();
+
+    MaxAlign = std::max(Align, MaxAlign);
+  }
+
+  // No stack protector.
+
+  // Then assign frame offsets to stack objects that are not used to spill
+  // callee saved registers.
+  for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+    if (MFI.isObjectPreAllocated(i) && MFI.getUseLocalStackAllocationBlock())
+      continue;
+    if (MFI.isDeadObjectIndex(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // No scavenger.
+
+  if (!TFI.targetHandlesStackFrameRounding()) {
+    // If we have reserved argument space for call sites in the function
+    // immediately on entry to the current function, count it as part of the
+    // overall stack size.
+    if (MFI.adjustsStack() && TFI.hasReservedCallFrame(Fn))
+      Offset += MFI.getMaxCallFrameSize();
+
+    // Round up the size to a multiple of the alignment. If the function has
+    // any calls or alloca's, align to the target's StackAlignment value to
+    // ensure that the callee's frame or the alloca data is suitably aligned;
+    // otherwise, for leaf functions, align to the TransientStackAlignment
+    // value.
+    unsigned StackAlign;
+    if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
+        (RegInfo->needsStackRealignment(Fn) && MFI.getObjectIndexEnd() != 0))
+      StackAlign = TFI.getStackAlignment();
+    else
+      StackAlign = TFI.getTransientStackAlignment();
+
+    // If the frame pointer is eliminated, all frame offsets will be relative
+    // to SP not FP. Align to MaxAlign so this works.
+    StackAlign = std::max(StackAlign, MaxAlign);
+    unsigned AlignMask = StackAlign - 1;
+    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+  }
+
+  // Update frame info to pretend that this is part of the stack...
+  int64_t StackSize = Offset - LocalAreaOffset;
+  MFI.setStackSize(StackSize);
+}
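Both AdjustStackOffset and the local-block path above round offsets up with the same integer arithmetic. As a standalone illustration of that formula (the values are arbitrary):

    #include <cassert>
    #include <cstdint>

    // (Offset + Align - 1) / Align * Align rounds Offset up to the next
    // multiple of Align.
    static int64_t roundUp(int64_t Offset, unsigned Align) {
      return (Offset + Align - 1) / Align * Align;
    }

    int main() {
      assert(roundUp(13, 8) == 16); // 13 bytes used, next 8-aligned slot
      assert(roundUp(16, 8) == 16); // already aligned offsets are unchanged
      return 0;
    }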
Index: lib/Target/NVPTX/NVPTXRegisterInfo.h
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -45,6 +45,8 @@
 
   unsigned getFrameRegister(const MachineFunction &MF) const override;
 
+  unsigned getSharedFrameRegister(const MachineFunction &MF) const;
+
   ManagedStringPool *getStrPool() const {
     return const_cast<ManagedStringPool *>(&ManagedStrPool);
   }
Index: lib/Target/NVPTX/NVPTXRegisterInfo.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -130,3 +130,8 @@
 unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return NVPTX::VRFrame;
 }
+
+unsigned NVPTXRegisterInfo::getSharedFrameRegister(
+    const MachineFunction &MF) const {
+  return NVPTX::VRShared;
+}
Index: lib/Target/NVPTX/NVPTXRegisterInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -25,9 +25,12 @@
 // Special Registers used as stack pointer
 def VRFrame       : NVPTXReg<"%SP">;
 def VRFrameLocal  : NVPTXReg<"%SPL">;
+def VRShared      : NVPTXReg<"%SPS">;
+def VRFrameShared : NVPTXReg<"%SPSH">;
 
 // Special Registers used as the stack
 def VRDepot       : NVPTXReg<"%Depot">;
+def VRSharedDepot : NVPTXReg<"%SharedDepot">;
 
 // We use virtual registers, but define a few physical registers here to keep
 // SDAG and the MachineInstr layers happy.
@@ -70,4 +73,4 @@
 // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
 def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
-                                            (sequence "ENVREG%u", 0, 31))>;
+    VRShared, VRFrameShared, VRSharedDepot, (sequence "ENVREG%u", 0, 31))>;
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -54,6 +54,7 @@
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
 void initializeNVPTXLowerArgsPass(PassRegistry &);
 void initializeNVPTXLowerAllocaPass(PassRegistry &);
+void initializeNVPTXFunctionDataSharingPass(PassRegistry &);
 
 } // end namespace llvm
 
@@ -72,6 +73,7 @@
   initializeNVPTXAssignValidGlobalNamesPass(PR);
   initializeNVPTXLowerArgsPass(PR);
   initializeNVPTXLowerAllocaPass(PR);
+  initializeNVPTXFunctionDataSharingPass(PR);
   initializeNVPTXLowerAggrCopiesPass(PR);
 }
 
@@ -148,6 +150,7 @@
   bool addInstSelector() override;
   void addPostRegAlloc() override;
   void addMachineSSAOptimization() override;
+  void addMachineSSALowering() override;
 
   FunctionPass *createTargetRegisterAllocator(bool) override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
@@ -247,10 +250,16 @@
   // before the address space inference passes.
   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
   if (getOptLevel() != CodeGenOpt::None) {
+    // Add the address space inference passes.
     addAddressSpaceInferencePasses();
     if (!DisableLoadStoreVectorizer)
       addPass(createLoadStoreVectorizerPass());
     addStraightLineScalarOptimizationPasses();
+  } else {
+    // When the shared depot is generated, certain alloca instructions
+    // must be lowered to the appropriate memory type for correctness,
+    // even when no optimizations are enabled.
+    addPass(createNVPTXFunctionDataSharingPass(&getNVPTXTargetMachine()));
   }
 
   // === LSR and other generic IR passes ===
@@ -328,6 +337,11 @@
   printAndVerify("After StackSlotColoring");
 }
 
+void NVPTXPassConfig::addMachineSSALowering() {
+  // Lower shared frame indices.
+  addPass(createNVPTXLowerSharedFrameIndicesPass(), false);
+}
+
 void NVPTXPassConfig::addMachineSSAOptimization() {
   // Pre-ra tail duplication.
   if (addPass(&EarlyTailDuplicateID))
@@ -337,6 +351,11 @@
   // instructions dead.
   addPass(&OptimizePHIsID);
 
+  // Lower shared frame indices before the SSA optimizations run, so
+  // that the optimizations do not treat shared and local frame
+  // indices alike.
+  addMachineSSALowering();
+
   // This pass merges large allocas. StackSlotColoring is a different pass
   // which merges spill slots.
   addPass(&StackColoringID);
Index: lib/Target/NVPTX/NVPTXUtilities.h
===================================================================
--- lib/Target/NVPTX/NVPTXUtilities.h
+++ lib/Target/NVPTX/NVPTXUtilities.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -60,6 +62,8 @@
 bool getAlign(const Function &, unsigned index, unsigned &);
 bool getAlign(const CallInst &, unsigned index, unsigned &);
 
+bool ptrIsStored(Value *Ptr);
+
 }
 
 #endif
Index: lib/Target/NVPTX/NVPTXUtilities.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXUtilities.cpp
+++ lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -12,6 +12,7 @@
 
 #include "NVPTXUtilities.h"
 #include "NVPTX.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -28,6 +29,8 @@
 
 namespace llvm {
 
+#define DEBUG_TYPE "nvptx-utilities"
+
 namespace {
 typedef std::map<std::string, std::vector<unsigned>> key_val_pair_t;
 typedef std::map<std::string, key_val_pair_t> global_val_annot_t;
@@ -314,4 +317,10 @@
   return false;
 }
 
+/// Returns true if any instruction stores the address held by this pointer,
+/// i.e. the pointer itself escapes through a store.
+bool ptrIsStored(Value *Ptr) {
+  return PointerMayBeCaptured(Ptr, /*ReturnCaptures=*/false,
+                              /*StoreCaptures=*/true);
+}
+
 } // namespace llvm
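ptrIsStored leans entirely on capture tracking: with ReturnCaptures=false and StoreCaptures=true, PointerMayBeCaptured answers exactly "is the pointer value itself written to memory somewhere?". A sketch of the two shapes this distinguishes (mirroring the allocas in the tests below; the helper name is illustrative):

    #include "llvm/Analysis/CaptureTracking.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // %A = alloca i32 ... store i32* %A, i32** %slot  -> returns true
    // %B = alloca i32 ... store i32 0, i32* %B        -> returns false
    static bool addressEscapesViaStore(AllocaInst *AI) {
      return PointerMayBeCaptured(AI, /*ReturnCaptures=*/false,
                                  /*StoreCaptures=*/true);
    }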
Index: test/CodeGen/NVPTX/insert-shared-depot.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/insert-shared-depot.ll
@@ -0,0 +1,42 @@
+; Check that the shared depot is emitted when the "has-nvptx-shared-depot" attribute is used.
+
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefixes=PTX32,CHECK
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefixes=PTX64,CHECK
+
+; CHECK: {{.*}}kernel()
+
+; PTX32: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+; PTX64: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+
+; PTX32: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+; PTX64: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+
+; PTX32: .reg .b32{{.*}}%SPS;
+; PTX64: .reg .b64{{.*}}%SPS;
+
+; PTX32: .reg .b32{{.*}}%SPSH;
+; PTX64: .reg .b64{{.*}}%SPSH;
+
+; PTX32: mov.u32{{.*}}%SPSH, __shared_depot0;
+; PTX64: mov.u64{{.*}}%SPSH, __shared_depot0;
+
+; PTX32: cvta.shared.u32{{.*}}%SPS, %SPSH;
+; PTX64: cvta.shared.u64{{.*}}%SPS, %SPSH;
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() #0 {
+  %A = alloca i32, align 4
+  %shared_args = alloca i8**, align 8
+  call void @callee(i8*** %shared_args)
+  store i32 10, i32* %A
+  ret void
+}
+
+declare void @callee(i8***)
+
+attributes #0 = {"has-nvptx-shared-depot"}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
Index: test/CodeGen/NVPTX/lower-alloca-shared.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/lower-alloca-shared.ll
@@ -0,0 +1,32 @@
+; Test case for the NVPTXLowerAlloca pass.
+
+; RUN: opt < %s -S -nvptx-lower-alloca -infer-address-spaces | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() #0 {
+; PTX-LABEL: .visible .entry kernel(
+  %A = alloca i32
+; CHECK: addrspacecast i32* %A to i32 addrspace(3)*
+; CHECK: addrspacecast i32 addrspace(3)* %1 to i32*
+; CHECK: store i32 0, i32 addrspace(3)* {{%.+}}
+; PTX: add.u64 {{%rd[0-9]+}}, %SPS, 0;
+; PTX: cvta.to.shared.u64 {{%rd[0-9]+}}, {{%rd[0-9]+}};
+; PTX: st.shared.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
+  %shared_args = alloca i32**
+  call void @callee(i32*** %shared_args)
+  %1 = load i32**, i32*** %shared_args
+  %2 = getelementptr inbounds i32*, i32** %1, i64 0
+  store i32* %A, i32** %2
+  store i32 0, i32* %A
+  ret void
+}
+
+declare void @callee(i32***)
+
+attributes #0 = {"has-nvptx-shared-depot"}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
Index: test/CodeGen/NVPTX/no-shared-depot.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/no-shared-depot.ll
@@ -0,0 +1,43 @@
+; Test case for when the shared depot does not need to be emitted.
+; The shared depot is not emitted if the function does not have the
+; "has-nvptx-shared-depot" attribute.
+
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+; PTX32: {{.*}}kernel()
+; PTX64: {{.*}}kernel()
+
+; PTX32: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+; PTX64: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+
+; PTX32-NOT: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+; PTX64-NOT: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+
+; PTX32-NOT: .reg .b32{{.*}}%SPS;
+; PTX64-NOT: .reg .b64{{.*}}%SPS;
+
+; PTX32-NOT: .reg .b32{{.*}}%SPSH;
+; PTX64-NOT: .reg .b64{{.*}}%SPSH;
+
+; PTX32-NOT: mov.u32{{.*}}%SPSH, __shared_depot0;
+; PTX64-NOT: mov.u64{{.*}}%SPSH, __shared_depot0;
+
+; PTX32-NOT: cvta.shared.u32{{.*}}%SPS, %SPSH;
+; PTX64-NOT: cvta.shared.u64{{.*}}%SPS, %SPSH;
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() {
+  %A = alloca i32, align 4
+  %shared_args = alloca i8**, align 8
+  call void @callee(i8*** %shared_args)
+  store i32 10, i32* %A
+  ret void
+}
+
+declare void @callee(i8***)
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
Index: test/CodeGen/NVPTX/nvptx-function-data-sharing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/nvptx-function-data-sharing.ll
@@ -0,0 +1,32 @@
+; Test case for the NVPTXFunctionDataSharing pass.
+
+; RUN: opt < %s -S -nvptx-function-data-sharing -infer-address-spaces | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() #0 {
+; PTX-LABEL: .visible .entry kernel(
+  %A = alloca i32
+; CHECK: addrspacecast i32* %A to i32 addrspace(3)*
+; CHECK: addrspacecast i32 addrspace(3)* %A1 to i32*
+; CHECK: store i32 0, i32 addrspace(3)* {{%.+}}
+; PTX: add.u64 {{%rd[0-9]+}}, %SPS, 0;
+; PTX: cvta.to.shared.u64 {{%rd[0-9]+}}, {{%rd[0-9]+}};
+; PTX: st.shared.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
+  %shared_args = alloca i32**
+  call void @callee(i32*** %shared_args)
+  %1 = load i32**, i32*** %shared_args
+  %2 = getelementptr inbounds i32*, i32** %1, i64 0
+  store i32* %A, i32** %2
+  store i32 0, i32* %A
+  ret void
+}
+
+declare void @callee(i32***)
+
+attributes #0 = {"has-nvptx-shared-depot"}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}