Index: polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
===================================================================
--- polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
+++ polly/trunk/include/polly/CodeGen/PPCGCodeGeneration.h
@@ -21,4 +21,8 @@
 /// The GPU Runtime implementation to use.
 enum GPURuntime { CUDA, OpenCL };
 
+namespace polly {
+extern bool PollyManagedMemory;
+}
+
 #endif // POLLY_PPCGCODEGENERATION_H
Index: polly/trunk/include/polly/LinkAllPasses.h
===================================================================
--- polly/trunk/include/polly/LinkAllPasses.h
+++ polly/trunk/include/polly/LinkAllPasses.h
@@ -51,6 +51,10 @@
 #ifdef GPU_CODEGEN
 llvm::Pass *createPPCGCodeGenerationPass(GPUArch Arch = GPUArch::NVPTX64,
                                          GPURuntime Runtime = GPURuntime::CUDA);
+
+llvm::Pass *
+createManagedMemoryRewritePassPass(GPUArch Arch = GPUArch::NVPTX64,
+                                   GPURuntime Runtime = GPURuntime::CUDA);
 #endif
 llvm::Pass *createIslScheduleOptimizerPass();
 llvm::Pass *createFlattenSchedulePass();
@@ -87,6 +91,7 @@
     polly::createCodeGenerationPass();
 #ifdef GPU_CODEGEN
     polly::createPPCGCodeGenerationPass();
+    polly::createManagedMemoryRewritePassPass();
 #endif
     polly::createIslScheduleOptimizerPass();
     polly::createMaximalStaticExpansionPass();
@@ -109,6 +114,7 @@
 void initializeCodeGenerationPass(llvm::PassRegistry &);
 #ifdef GPU_CODEGEN
 void initializePPCGCodeGenerationPass(llvm::PassRegistry &);
+void initializeManagedMemoryRewritePassPass(llvm::PassRegistry &);
 #endif
 void initializeIslScheduleOptimizerPass(llvm::PassRegistry &);
 void initializeMaximalStaticExpanderPass(llvm::PassRegistry &);
Index: polly/trunk/lib/CMakeLists.txt
===================================================================
--- polly/trunk/lib/CMakeLists.txt
+++ polly/trunk/lib/CMakeLists.txt
@@ -9,6 +9,7 @@
 if (GPU_CODEGEN)
   set (GPGPU_CODEGEN_FILES
        CodeGen/PPCGCodeGeneration.cpp
+       CodeGen/ManagedMemoryRewrite.cpp
        )
 endif (GPU_CODEGEN)
 
Index: 
polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp
===================================================================
--- polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp
+++ polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp
@@ -0,0 +1,164 @@
+//===- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Take a module and rewrite:
+// 1. `malloc` -> `polly_mallocManaged`
+// 2. `free` -> `polly_freeManaged`
+//
+// TODO: Not implemented yet:
+// 3. global arrays with initializers -> global arrays that are initialized
+//                                       with a constructor call to
+//                                       `polly_mallocManaged`.
+//
+//===----------------------------------------------------------------------===//
+
+#include "polly/CodeGen/CodeGeneration.h"
+#include "polly/CodeGen/IslAst.h"
+#include "polly/CodeGen/IslNodeBuilder.h"
+#include "polly/CodeGen/PPCGCodeGeneration.h"
+#include "polly/CodeGen/Utils.h"
+#include "polly/DependenceInfo.h"
+#include "polly/LinkAllPasses.h"
+#include "polly/Options.h"
+#include "polly/ScopDetection.h"
+#include "polly/ScopInfo.h"
+#include "polly/Support/SCEVValidator.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+namespace {
+
+/// Return the declaration of `polly_mallocManaged` in @p M.
+///
+/// If @p M has no function of that name, an external declaration of type
+/// `i8* (i64)` is added to the module.
+static llvm::Function *GetOrCreatePollyMallocManaged(Module &M) {
+  // TODO: should I allow this pass to be a standalone pass that
+  // doesn't care if PollyManagedMemory is enabled or not?
+  assert(PollyManagedMemory &&
+         "One should only rewrite malloc & free to "
+         "polly_{malloc,free}Managed with managed memory enabled.");
+  const char *Name = "polly_mallocManaged";
+  Function *F = M.getFunction(Name);
+
+  // If F is not available, declare it.
+  if (!F) {
+    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+    PollyIRBuilder Builder(M.getContext());
+    // TODO: How do I get `size_t`? I assume from DataLayout?
+    FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(),
+                                         {Builder.getInt64Ty()}, false);
+    F = Function::Create(Ty, Linkage, Name, &M);
+  }
+
+  return F;
+}
+
+/// Return the declaration of `polly_freeManaged` in @p M.
+///
+/// If @p M has no function of that name, an external declaration of type
+/// `void (i8*)` is added to the module.
+static llvm::Function *GetOrCreatePollyFreeManaged(Module &M) {
+  // TODO: should I allow this pass to be a standalone pass that
+  // doesn't care if PollyManagedMemory is enabled or not?
+  assert(PollyManagedMemory &&
+         "One should only rewrite malloc & free to "
+         "polly_{malloc,free}Managed with managed memory enabled.");
+  const char *Name = "polly_freeManaged";
+  Function *F = M.getFunction(Name);
+
+  // If F is not available, declare it.
+  if (!F) {
+    GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
+    PollyIRBuilder Builder(M.getContext());
+    FunctionType *Ty =
+        FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false);
+    F = Function::Create(Ty, Linkage, Name, &M);
+  }
+
+  return F;
+}
+
+/// Module pass that redirects all uses of `malloc` and `free` to
+/// `polly_mallocManaged` and `polly_freeManaged`.
+class ManagedMemoryRewritePass : public ModulePass {
+public:
+  static char ID;
+  // Set by createManagedMemoryRewritePassPass(); not read by runOnModule().
+  GPUArch Architecture = GPUArch::NVPTX64;
+  GPURuntime Runtime = GPURuntime::CUDA;
+  ManagedMemoryRewritePass() : ModulePass(ID) {}
+
+  /// Rewrite all uses of `malloc` and `free` in @p M.
+  ///
+  /// @returns True if @p M was modified.
+  bool runOnModule(Module &M) override {
+    // This pass is added to the pipeline next to PPCG code generation whether
+    // or not managed memory is enabled; only rewrite when it is.
+    if (!PollyManagedMemory)
+      return false;
+
+    bool Changed = false;
+
+    if (Function *Malloc = M.getFunction("malloc")) {
+      Function *PollyMallocManaged = GetOrCreatePollyMallocManaged(M);
+      assert(PollyMallocManaged && "unable to create polly_mallocManaged");
+      Malloc->replaceAllUsesWith(PollyMallocManaged);
+      Changed = true;
+    }
+
+    if (Function *Free = M.getFunction("free")) {
+      Function *PollyFreeManaged = GetOrCreatePollyFreeManaged(M);
+      assert(PollyFreeManaged && "unable to create polly_freeManaged");
+      Free->replaceAllUsesWith(PollyFreeManaged);
+      Changed = true;
+    }
+
+    return Changed;
+  }
+};
+
+} // namespace
+char ManagedMemoryRewritePass::ID = 42;
+
+Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch,
+                                                GPURuntime Runtime) {
+  ManagedMemoryRewritePass *pass = new ManagedMemoryRewritePass();
+  pass->Runtime = Runtime;
+  pass->Architecture = Arch;
+  return pass;
+}
+
+INITIALIZE_PASS_BEGIN(
+    ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
+    "Polly - Rewrite all allocations in heap & data section to managed memory",
+    false, false)
+INITIALIZE_PASS_DEPENDENCY(PPCGCodeGeneration);
+INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
+INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
+INITIALIZE_PASS_END(
+    ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
+    "Polly - Rewrite all allocations in heap & data section to managed memory",
+    false, false)
Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -90,12 +90,14 @@
                                    cl::init(false), cl::ZeroOrMore,
                                    cl::cat(PollyCategory));
 
-static cl::opt<bool> ManagedMemory("polly-acc-codegen-managed-memory",
-                                   cl::desc("Generate Host kernel code assuming"
-                                            " that all memory has been"
-                                            " declared as managed memory"),
-                                   cl::Hidden, cl::init(false), cl::ZeroOrMore,
-                                   cl::cat(PollyCategory));
+bool polly::PollyManagedMemory;
+static cl::opt<bool, true>
+    XManagedMemory("polly-acc-codegen-managed-memory",
+                   cl::desc("Generate Host kernel code assuming"
+                            " that all memory has been"
+                            " declared as managed memory"),
+                   cl::location(PollyManagedMemory), cl::Hidden,
+                   cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
 
 static cl::opt<bool>
     FailOnVerifyModuleFailure("polly-acc-fail-on-verify-module-failure",
@@ -746,14 +748,14 @@
 
   GPUContext = createCallInitContext();
 
-  if (!ManagedMemory)
+  if (!PollyManagedMemory)
     allocateDeviceArrays();
   else
     prepareManagedDeviceArrays();
 }
 
 void GPUNodeBuilder::finalize() {
-  if (!ManagedMemory)
+  if (!PollyManagedMemory)
     freeDeviceArrays();
 
   createCallFreeContext(GPUContext);
@@ -761,8 +763,9 @@
 }
 
 void GPUNodeBuilder::allocateDeviceArrays() {
-  assert(!ManagedMemory && "Managed memory will directly send host pointers "
-                           "to the kernel. There is no need for device arrays");
+  assert(!PollyManagedMemory &&
+         "Managed memory will directly send host pointers "
+         "to the kernel. There is no need for device arrays");
   isl_ast_build *Build = isl_ast_build_from_context(S.getContext().release());
 
   for (int i = 0; i < Prog->n_array; ++i) {
@@ -800,7 +803,7 @@
 }
 
 void GPUNodeBuilder::prepareManagedDeviceArrays() {
-  assert(ManagedMemory &&
+  assert(PollyManagedMemory &&
          "Device array most only be prepared in managed-memory mode");
   for (int i = 0; i < Prog->n_array; ++i) {
     gpu_array_info *Array = &Prog->array[i];
@@ -847,7 +850,7 @@
 }
 
 void GPUNodeBuilder::freeDeviceArrays() {
-  assert(!ManagedMemory && "Managed memory does not use device arrays");
+  assert(!PollyManagedMemory && "Managed memory does not use device arrays");
   for (auto &Array : DeviceAllocations)
     createCallFreeDeviceMemory(Array.second);
 }
@@ -932,8 +935,9 @@
 }
 
 void GPUNodeBuilder::createCallFreeDeviceMemory(Value *Array) {
-  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
-                           "for device");
+  assert(!PollyManagedMemory &&
+         "Managed memory does not allocate or free memory "
+         "for device");
   const char *Name = "polly_freeDeviceMemory";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -951,8 +955,9 @@
 }
 
 Value *GPUNodeBuilder::createCallAllocateMemoryForDevice(Value *Size) {
-  assert(!ManagedMemory && "Managed memory does not allocate or free memory "
-                           "for device");
+  assert(!PollyManagedMemory &&
+         "Managed memory does not allocate or free memory "
+         "for device");
   const char *Name = "polly_allocateMemoryForDevice";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -972,8 +977,9 @@
 void GPUNodeBuilder::createCallCopyFromHostToDevice(Value *HostData,
                                                     Value *DeviceData,
                                                     Value *Size) {
-  assert(!ManagedMemory && "Managed memory does not transfer memory between "
-                           "device and host");
+  assert(!PollyManagedMemory &&
+         "Managed memory does not transfer memory between "
+         "device and host");
   const char *Name = "polly_copyFromHostToDevice";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -995,8 +1001,9 @@
 void GPUNodeBuilder::createCallCopyFromDeviceToHost(Value *DeviceData,
                                                     Value *HostData,
                                                     Value *Size) {
-  assert(!ManagedMemory && "Managed memory does not transfer memory between "
-                           "device and host");
+  assert(!PollyManagedMemory &&
+         "Managed memory does not transfer memory between "
+         "device and host");
   const char *Name = "polly_copyFromDeviceToHost";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -1016,8 +1023,8 @@
 }
 
 void GPUNodeBuilder::createCallSynchronizeDevice() {
-  assert(ManagedMemory && "explicit synchronization is only necessary for "
-                          "managed memory");
+  assert(PollyManagedMemory && "explicit synchronization is only necessary for "
+                               "managed memory");
   const char *Name = "polly_synchronizeDevice";
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *F = M->getFunction(Name);
@@ -1144,9 +1151,9 @@
 
 Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info *Array,
                                              ScopArrayInfo *ArrayInfo) {
-  assert(ManagedMemory && "Only used when you wish to get a host "
-                          "pointer for sending data to the kernel, "
-                          "with managed memory");
+  assert(PollyManagedMemory && "Only used when you wish to get a host "
+                               "pointer for sending data to the kernel, "
+                               "with managed memory");
   std::map<ScopArrayInfo *, Value *>::iterator it;
   it = DeviceAllocations.find(ArrayInfo);
   assert(it != DeviceAllocations.end() &&
@@ -1156,7 +1163,7 @@
 
 void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt,
                                         enum DataDirection Direction) {
-  assert(!ManagedMemory && "Managed memory needs no data transfers");
+  assert(!PollyManagedMemory && "Managed memory needs no data transfers");
   isl_ast_expr *Expr = isl_ast_node_user_get_expr(TransferStmt);
   isl_ast_expr *Arg = isl_ast_expr_get_op_arg(Expr, 0);
   isl_id *Id = isl_ast_expr_get_id(Arg);
@@ -1226,7 +1233,7 @@
     return;
   }
   if (isPrefix(Str, "to_device")) {
-    if (!ManagedMemory)
+    if (!PollyManagedMemory)
       createDataTransfer(UserStmt, HOST_TO_DEVICE);
     else
       isl_ast_node_free(UserStmt);
@@ -1236,7 +1243,7 @@
   }
 
   if (isPrefix(Str, "from_device")) {
-    if (!ManagedMemory) {
+    if (!PollyManagedMemory) {
       createDataTransfer(UserStmt, DEVICE_TO_HOST);
     } else {
       createCallSynchronizeDevice();
@@ -1596,7 +1603,7 @@
     ArgSizes[Index] = SAI->getElemSizeInBytes();
 
     Value *DevArray = nullptr;
-    if (ManagedMemory) {
+    if (PollyManagedMemory) {
       DevArray = getManagedDeviceArray(&Prog->array[i],
                                        const_cast<ScopArrayInfo *>(SAI));
     } else {
@@ -1618,7 +1625,7 @@
     if (gpu_array_is_read_only_scalar(&Prog->array[i])) {
       Value *ValPtr = nullptr;
 
-      if (ManagedMemory)
+      if (PollyManagedMemory)
         ValPtr = DevArray;
       else
         ValPtr = BlockGen.getOrCreateAlloca(SAI);
Index: polly/trunk/lib/Support/RegisterPasses.cpp
===================================================================
--- polly/trunk/lib/Support/RegisterPasses.cpp
+++ polly/trunk/lib/Support/RegisterPasses.cpp
@@ -245,6 +245,7 @@
 
 #ifdef GPU_CODEGEN
   initializePPCGCodeGenerationPass(Registry);
+  initializeManagedMemoryRewritePassPass(Registry);
   LLVMInitializeNVPTXTarget();
   LLVMInitializeNVPTXTargetInfo();
   LLVMInitializeNVPTXTargetMC();
@@ -345,9 +346,12 @@
     PM.add(polly::createPruneUnprofitablePass());
 
 #ifdef GPU_CODEGEN
-  if (Target == TARGET_HYBRID)
+  if (Target == TARGET_HYBRID) {
     PM.add(
         polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice));
+    PM.add(polly::createManagedMemoryRewritePassPass(GPUArchChoice,
+                                                     GPURuntimeChoice));
+  }
 #endif
   if (Target == TARGET_CPU || Target == TARGET_HYBRID)
     switch (Optimizer) {
@@ -374,9 +378,12 @@
       break;
     }
 #ifdef GPU_CODEGEN
-  else
+  else {
     PM.add(
         polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice));
+    PM.add(polly::createManagedMemoryRewritePassPass(GPUArchChoice,
+                                                     GPURuntimeChoice));
+  }
 #endif
 
   // FIXME: This dummy ModulePass keeps some programs from miscompiling,
Index: polly/trunk/test/GPGPU/managed-memory-rewrite-malloc-free.ll
===================================================================
--- polly/trunk/test/GPGPU/managed-memory-rewrite-malloc-free.ll
+++ polly/trunk/test/GPGPU/managed-memory-rewrite-malloc-free.ll
@@ -0,0 +1,90 @@
+; RUN: opt %loadPolly -polly-scops \
+; RUN: -analyze < %s | FileCheck %s --check-prefix=SCOP
+
+; RUN: opt %loadPolly -polly-codegen-ppcg \
+; RUN: -S -polly-acc-codegen-managed-memory \
+; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR
+;
+; REQUIRES: pollyacc
+;
+; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`, and
+; `free` to `polly_freeManaged` with the `polly-acc-rewrite-managed-memory`
+; pass.
+;
+; #include <stdlib.h>
+;
+; static const int N = 100;
+; int* f(int *ToFree) {
+;     free(ToFree);
+;     int *A = (int *)malloc(sizeof(int) * N);
+;     for(int i = 0; i < N; i++) {
+;         A[i] = 42;
+;     }
+;     return A;
+;
+; }
+
+; SCOP:      Function: f
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth: 1
+
+; SCOP:      Arrays {
+; SCOP-NEXT: i32 MemRef_call[*]; // Element size 4
+; SCOP-NEXT: }
+
+; // Check that the calls are rewritten; @f frees before it allocates.
+; HOST-IR: %toFreeBitcast = bitcast i32* %toFree to i8*
+; HOST-IR: call void @polly_freeManaged(i8* %toFreeBitcast)
+; HOST-IR: %call = tail call i8* @polly_mallocManaged(i64 400)
+
+; // Check that both managed functions are declared (in any order).
+; HOST-IR-DAG: declare i8* @polly_mallocManaged(i64)
+; HOST-IR-DAG: declare void @polly_freeManaged(i8*)
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+define i32* @f(i32 *%toFree) {
+entry:
+  %toFreeBitcast = bitcast i32* %toFree to i8*
+  call void @free(i8* %toFreeBitcast)
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %call = tail call i8* @malloc(i64 400)
+  %tmp = bitcast i8* %call to i32*
+  br label %for.body
+
+for.body:                                         ; preds = %entry.split, %for.body
+  %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv1
+  store i32 42, i32* %arrayidx, align 4, !tbaa !3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32* %tmp
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+
+declare i8* @malloc(i64)
+declare void @free(i8*)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
+
+attributes #0 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
Index: polly/trunk/test/GPGPU/managed-memory-rewrite-malloc.ll
===================================================================
--- polly/trunk/test/GPGPU/managed-memory-rewrite-malloc.ll
+++ polly/trunk/test/GPGPU/managed-memory-rewrite-malloc.ll
@@ -0,0 +1,77 @@
+; RUN: opt %loadPolly -polly-scops \
+; RUN: -analyze < %s | FileCheck %s --check-prefix=SCOP
+
+; RUN: opt %loadPolly -polly-codegen-ppcg \
+; RUN: -S -polly-acc-codegen-managed-memory \
+; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR
+; REQUIRES: pollyacc
+;
+; Check that we can correctly rewrite `malloc` to `polly_mallocManaged`
+;
+; #include <stdlib.h>
+;
+; static const int N = 100;
+; int* f() {
+;     int *A = (int *)malloc(sizeof(int) * N);
+;     for(int i = 0; i < N; i++) {
+;         A[i] = 42;
+;     }
+;     return A;
+; }
+
+; SCOP:      Function: f
+; SCOP-NEXT: Region: %for.body---%for.end
+; SCOP-NEXT: Max Loop Depth: 1
+
+; SCOP:      Arrays {
+; SCOP-NEXT: i32 MemRef_call[*]; // Element size 4
+; SCOP-NEXT: }
+
+; // Check that polly_mallocManaged is declared and used correctly.
+; HOST-IR: %call = tail call i8* @polly_mallocManaged(i64 400)
+; HOST-IR: declare i8* @polly_mallocManaged(i64)
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+define i32* @f() {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %call = tail call i8* @malloc(i64 400)
+  %tmp = bitcast i8* %call to i32*
+  br label %for.body
+
+for.body:                                         ; preds = %entry.split, %for.body
+  %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %tmp, i64 %indvars.iv1
+  store i32 42, i32* %arrayidx, align 4, !tbaa !3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32* %tmp
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+
+declare i8* @malloc(i64)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
+
+attributes #0 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}