Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -86,6 +86,13 @@
                                cl::init(false), cl::ZeroOrMore,
                                cl::cat(PollyCategory));
 
+static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory",
+                                   cl::desc("Generate Host kernel code assuming"
+                                            " that all memory has been"
+                                            " declared as managed memory"),
+                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
+                                   cl::cat(PollyCategory));
+
 static cl::opt<std::string>
     CudaVersion("polly-acc-cuda-version",
                 cl::desc("The CUDA version to compile for"), cl::Hidden,
@@ -243,6 +250,14 @@
   /// @returns A tuple with grid sizes for X and Y dimension
   std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
 
+  /// Create an array that can be sent to the kernel on the device using a
+  /// host pointer. This is required for managed memory, when we directly
+  /// send host pointers to the device.
+  /// \note
+  /// This is to be used only with managed memory.
+  Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
+                                       ScopArrayInfo *ArrayInfo);
+
   /// Compute the sizes of the thread blocks for a given kernel.
   ///
   /// @param Kernel The kernel to compute thread block sizes for.
@@ -449,6 +464,11 @@
   void createCallCopyFromDeviceToHost(Value *DevicePtr, Value *HostPtr,
                                       Value *Size);
 
+  /// Create a call to synchronize Host & Device.
+  /// \note
+  /// This is to be used only with managed memory.
+  void createCallSynchronizeDevice();
+
   /// Create a call to get a kernel from an assembly string.
   ///
   /// @param Buffer The string describing the kernel.
@@ -485,16 +505,23 @@
   Builder.SetInsertPoint(&NewBB->front());
 
   GPUContext = createCallInitContext();
-  allocateDeviceArrays();
+
+  if (!ManagedMemory) {
+    allocateDeviceArrays();
+  }
 }
 
 void GPUNodeBuilder::finalize() {
-  freeDeviceArrays();
+  if (!ManagedMemory) {
+    freeDeviceArrays();
+  }
   createCallFreeContext(GPUContext);
   IslNodeBuilder::finalize();
 }
 
 void GPUNodeBuilder::allocateDeviceArrays() {
+  assert(!ManagedMemory && "managed memory will directly send host pointers "
+                           "to the kernel. There is no need for device arrays");
   isl_ast_build *Build = isl_ast_build_from_context(S.getContext());
 
   for (int i = 0; i < Prog->n_array; ++i) {
@@ -540,6 +567,7 @@
 }
 
 void GPUNodeBuilder::freeDeviceArrays() {
+  assert(!ManagedMemory && "Managed memory does not use device arrays");
   for (auto &Array : DeviceAllocations)
     createCallFreeDeviceMemory(Array.second);
 }
@@ -624,6 +652,7 @@
 }
 
 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
+  assert(!ManagedMemory);
   const char *Name = "polly_freeDeviceMemory";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -641,6 +670,8 @@
 }
 
 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
+  assert(!ManagedMemory &&
+         "device memory allocation is not used by managed memory");
   const char *Name = "polly_allocateMemoryForDevice";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -660,6 +691,7 @@
 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
                                                     Value *DeviceData,
                                                     Value *Size) {
+  assert(!ManagedMemory);
   const char *Name = "polly_copyFromHostToDevice";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -681,6 +713,7 @@
 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
                                                     Value *HostData,
                                                     Value *Size) {
+  assert(!ManagedMemory);
   const char *Name = "polly_copyFromDeviceToHost";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -699,6 +732,22 @@
   Builder.CreateCall(F, {DeviceData, HostData, Size});
 }
 
+void GPUNodeBuilder::createCallSynchronizeDevice() {
+  assert(ManagedMemory);
+  const char *Name = "polly_synchronizeDevice";
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Function *F = M->getFunction(Name);
+
+  // If F is not available, declare it.
+  if (!F) {
+    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
+    F = Function::Create(Ty, Linkage, Name, M);
+  }
+
+  Builder.CreateCall(F);
+}
+
 Value *GPUNodeBuilder::createCallInitContext() {
   const char *Name = "polly_initContext";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
@@ -805,8 +854,40 @@
   return ResultValue;
 }
 
+Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
+                                                     ScopArrayInfo *ArrayInfo) {
+
+  assert(ManagedMemory && "This is only used when you wish to get a host "
+                          "pointer for sending data to the kernel, "
+                          "with managed memory");
+  std::map<ScopArrayInfo *, Value *>::iterator it;
+  if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
+    // The array was already mapped in this scop; reuse the host pointer.
+    return it->second;
+  } else {
+    Value *HostPtr;
+
+    if (gpu_array_is_scalar(Array))
+      HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
+    else
+      HostPtr = ArrayInfo->getBasePtr();
+
+    Value *Offset = getArrayOffset(Array);
+    if (Offset) {
+      HostPtr = Builder.CreatePointerCast(
+          HostPtr, ArrayInfo->getElementType()->getPointerTo());
+      HostPtr = Builder.CreateGEP(HostPtr, Offset);
+    }
+
+    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
+    DeviceAllocations[ArrayInfo] = HostPtr;
+    return HostPtr;
+  }
+}
+
 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                         enum DataDirection Direction) {
+  assert(!ManagedMemory && "managed memory needs no data transfers");
   isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
   isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
   isl_id *Id = isl_ast_expr_get_id(Arg);
@@ -864,13 +945,22 @@
   }
 
   if (isPrefix(Str, "to_device")) {
-    createDataTransfer(UserStmt, HOST_TO_DEVICE);
+    if (!ManagedMemory) {
+      createDataTransfer(UserStmt, HOST_TO_DEVICE);
+    } else {
+      isl_ast_node_free(UserStmt);
+    }
     isl_ast_expr_free(Expr);
     return;
   }
 
   if (isPrefix(Str, "from_device")) {
-    createDataTransfer(UserStmt, DEVICE_TO_HOST);
+    if (!ManagedMemory) {
+      createDataTransfer(UserStmt, DEVICE_TO_HOST);
+    } else {
+      createCallSynchronizeDevice();
+      isl_ast_node_free(UserStmt);
+    }
     isl_ast_expr_free(Expr);
     return;
   }
@@ -1096,9 +1186,16 @@
     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
-    Value *DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
-    DevArray = createCallGetDevicePtr(DevArray);
-
+    Value *DevArray = nullptr;
+    if (ManagedMemory) {
+      DevArray = getOrCreateManagedDeviceArray(
+          &Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
+    } else {
+      DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
+      DevArray = createCallGetDevicePtr(DevArray);
+    }
+    assert(DevArray != nullptr && "Array to be offloaded to device not "
+                                  "initialized");
     Value *Offset = getArrayOffset(&Prog->array[i]);
     if (Offset) {
@@ -1111,7 +1208,13 @@
         Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
 
     if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
-      Value *ValPtr = BlockGen.getOrCreateAlloca(SAI);
+      Value *ValPtr = nullptr;
+      if (ManagedMemory) {
+        ValPtr = DevArray;
+      } else {
+        ValPtr = BlockGen.getOrCreateAlloca(SAI);
+      }
+      assert(ValPtr != nullptr);
       Value *ValPtrCast =
           Builder.CreatePointerCast(ValPtr, Builder.getInt8PtrTy());
       Builder.CreateStore(ValPtrCast, Slot);
Index: test/GPGPU/cuda-managed-memory-simple.ll
===================================================================
--- /dev/null
+++ test/GPGPU/cuda-managed-memory-simple.ll
@@ -0,0 +1,118 @@
+; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
+; RUN: FileCheck %s
+
+; REQUIRES: pollyacc
+
+;
+; #include <cuda_runtime.h>
+;
+; static const int N = 45;
+;
+; void copy(int *R, int *A) {
+;   for (int i = 0; i < N; i++) {
+;     R[i] = A[i] * 10;
+;   }
+; }
+;
+; int main() {
+;   int *A, *R;
+;
+;   cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
+;   cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
+;
+;   for (int i = 0; i < N; i++) {
+;     A[i] = i;
+;     R[i] = 0;
+;   }
+;   copy(R, A);
+;
+;   return 0;
+; }
+;
+
+; CHECK-NOT: polly_copyFromHostToDevice
+; CHECK-NOT: polly_copyFromDeviceToHost
+; CHECK-NOT: polly_freeDeviceMemory
+; CHECK-NOT: polly_allocateMemoryForDevice
+
+; CHECK: %13 = call i8* @polly_initContext()
+; CHECK-NEXT: %14 = bitcast i32* %A to i8*
+; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0
+; CHECK-NEXT: %16 = bitcast i8** %polly_launch_0_param_0 to i8*
+; CHECK-NEXT: store i8* %16, i8** %15
+; CHECK-NEXT: %17 = bitcast i32* %R to i8*
+; CHECK-NEXT: %18 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
+; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1
+; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*
+; CHECK-NEXT: store i8* %19, i8** %18
+; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([820 x i8], [820 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
+; CHECK-NEXT: call void @polly_freeKernel(i8* %20)
+; CHECK-NEXT: call void @polly_synchronizeDevice()
+; CHECK-NEXT: call void @polly_freeContext(i8* %13)
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @copy(i32* %R, i32* %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, 45
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %mul = mul nsw i32 %tmp, 10
+  %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+define i32 @main() {
+entry:
+  %A = alloca i32*, align 8
+  %R = alloca i32*, align 8
+  %tmp = bitcast i32** %A to i8**
+  %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
+  %tmp1 = bitcast i32** %R to i8**
+  %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %exitcond = icmp ne i64 %indvars.iv, 45
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %tmp2 = load i32*, i32** %A, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
+  %tmp3 = trunc i64 %indvars.iv to i32
+  store i32 %tmp3, i32* %arrayidx, align 4
+  %tmp4 = load i32*, i32** %R, align 8
+  %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
+  store i32 0, i32* %arrayidx3, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %tmp5 = load i32*, i32** %R, align 8
+  %tmp6 = load i32*, i32** %A, align 8
+  call void @copy(i32* %tmp5, i32* %tmp6)
+  ret i32 0
+}
+
+declare i32 @cudaMallocManaged(i8**, i64, i32) #1
Index: tools/GPURuntime/GPUJIT.h
===================================================================
--- tools/GPURuntime/GPUJIT.h
+++ tools/GPURuntime/GPUJIT.h
@@ -88,6 +88,7 @@
                                 long MemSize);
 void polly_copyFromDeviceToHost(PollyGPUDevicePtr *DevData, void *HostData,
                                 long MemSize);
+void polly_synchronizeDevice();
 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                         unsigned int GridDimY, unsigned int BlockSizeX,
                         unsigned int BlockSizeY, unsigned int BlockSizeZ,
Index: tools/GPURuntime/GPUJIT.c
===================================================================
--- tools/GPURuntime/GPUJIT.c
+++ tools/GPURuntime/GPUJIT.c
@@ -130,6 +130,9 @@
 typedef CUresult CUDAAPI CuLinkDestroyFcnTy(CUlinkState state);
 static CuLinkDestroyFcnTy *CuLinkDestroyFcnPtr;
 
+typedef CUresult CUDAAPI CuCtxSynchronizeFcnTy();
+static CuCtxSynchronizeFcnTy *CuCtxSynchronizeFcnPtr;
+
 /* Type-defines of function pointer ot CUDA runtime APIs. */
 typedef cudaError_t CUDARTAPI CudaThreadSynchronizeFcnTy(void);
 static CudaThreadSynchronizeFcnTy *CudaThreadSynchronizeFcnPtr;
@@ -233,6 +236,9 @@
   CuLinkDestroyFcnPtr =
       (CuLinkDestroyFcnTy *)getAPIHandle(HandleCuda, "cuLinkDestroy");
 
+  CuCtxSynchronizeFcnPtr =
+      (CuCtxSynchronizeFcnTy *)getAPIHandle(HandleCuda, "cuCtxSynchronize");
+
   /* Get function pointer to CUDA Runtime APIs. */
   CudaThreadSynchronizeFcnPtr = (CudaThreadSynchronizeFcnTy *)getAPIHandle(
       HandleCudaRT, "cudaThreadSynchronize");
@@ -436,6 +442,13 @@
     exit(-1);
   }
 }
+void polly_synchronizeDevice() {
+  dump_function();
+  if (CuCtxSynchronizeFcnPtr() != CUDA_SUCCESS) {
+    fprintf(stdout, "Synchronizing device and host memory failed.\n");
+    exit(-1);
+  }
+}
 
 void polly_launchKernel(PollyGPUFunction *Kernel, unsigned int GridDimX,
                         unsigned int GridDimY, unsigned int BlockDimX,
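
For reference, the net effect of -polly-acc-codegen-managed-memory on the
generated host code (mirrored by the CHECK lines in the test above) is
sketched below in plain C. This is a minimal illustration, not Polly's
verbatim output: run_copy_scop_managed and KernelAsm are hypothetical
stand-ins, while the polly_* entry points come from GPUJIT.h and the launch
geometry (grid 2x1, block 32x1x1) from the test's CHECK lines.

#include "GPUJIT.h"

/* A minimal sketch, assuming A and R were allocated with cudaMallocManaged():
 * the host side of one SCoP under -polly-acc-codegen-managed-memory.
 * KernelAsm stands in for the PTX string Polly embeds in the module. */
static void run_copy_scop_managed(int *A, int *R, const char *KernelAsm) {
  PollyGPUContext *Context = polly_initContext();

  /* With managed memory the raw host pointers are the kernel arguments:
   * no polly_allocateMemoryForDevice / polly_copyFromHostToDevice calls. */
  void *Params[2];
  Params[0] = &A; /* a pointer to each argument value, CUDA-style */
  Params[1] = &R;

  PollyGPUFunction *Kernel = polly_getKernel(KernelAsm, "kernel_0");
  polly_launchKernel(Kernel, 2, 1, 32, 1, 1, Params);
  polly_freeKernel(Kernel);

  /* Replaces polly_copyFromDeviceToHost: a single device-wide barrier
   * (cuCtxSynchronize underneath) before the host reads R. */
  polly_synchronizeDevice();

  polly_freeContext(Context);
}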