Index: lib/CodeGen/ManagedMemoryRewrite.cpp =================================================================== --- lib/CodeGen/ManagedMemoryRewrite.cpp +++ lib/CodeGen/ManagedMemoryRewrite.cpp @@ -1,5 +1,4 @@ -//===------ ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory. -//---===// +//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===// // // The LLVM Compiler Infrastructure // @@ -30,10 +29,12 @@ #include "polly/Support/SCEVValidator.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DerivedUser.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" @@ -43,14 +44,20 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +static cl::opt RewriteAllocas( + "polly-acc-rewrite-allocas", + cl::desc( + "Ask the managed memory rewriter to also rewrite alloca instructions"), + cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); + +#define DEBUG_TYPE "polly-acc-rewrite-managed-memory" namespace { static llvm::Function *GetOrCreatePollyMallocManaged(Module &M) { // TODO: should I allow this pass to be a standalone pass that // doesn't care if PollyManagedMemory is enabled or not? - assert(PollyManagedMemory && - "One should only rewrite malloc & free to" - "polly_{malloc,free}Managed with managed memory enabled."); const char *Name = "polly_mallocManaged"; Function *F = M.getFunction(Name); @@ -70,9 +77,6 @@ static llvm::Function *GetOrCreatePollyFreeManaged(Module &M) { // TODO: should I allow this pass to be a standalone pass that // doesn't care if PollyManagedMemory is enabled or not? - assert(PollyManagedMemory && - "One should only rewrite malloc & free to" - "polly_{malloc,free}Managed with managed memory enabled."); const char *Name = "polly_freeManaged"; Function *F = M.getFunction(Name); @@ -89,13 +93,258 @@ return F; } +// Expand the constant expression Cur using Builder. This will recursively +// expand `Cur` to arrive at a set of instructions. +// `Expands` is populated all the expanded instructions. +// NOTE: this simply `insert`s into Expands. +static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder, + Instruction *Parent, int index, + SmallPtrSet &Expands) { + DEBUG(dbgs() << "\n\n\nExpanding: " << *Cur << "\n"; + dbgs() << "Parent: " << *Parent << "\n";); + assert(Cur && "invalid constant expression passed"); + SmallVector, 2> Replacements; + + Instruction *I = Cur->getAsInstruction(); + Expands.insert(I); + Parent->setOperand(index, I); + + assert(I && "unable to convert ConstantExpr to Instruction"); + // I need instructions to be created before this. + Builder.SetInsertPoint(Parent); + Builder.Insert(I); + + DEBUG(dbgs() << "Expanded: " << *I << "\n";); + for (unsigned i = 0; i < Cur->getNumOperands(); i++) { + Constant *COp = dyn_cast(Cur->getOperand(i)); + assert(COp && "constant must have a constant operand"); + if (isa(COp)) { + expandConstantExpr(dyn_cast(COp), Builder, I, i, Expands); + }; + } +} + +// Edit all uses of `ArrPtrToRewrite` to `NewLoadedPtr` in `Inst`. +// This will change all `GEP`s into `ArrPtrToRewrite` to `NewLoadedPtr`, +// re-indexing the GEPs correctly as well. It will change all raw uses of +// `ArrPtrToRewrite` to `NewBitcastedPtr`. +static void rewriteArrToPtr(Instruction *Inst, Value *ArrPtrToRewrite, + Value *NewLoadedPtr, Value *NewBitcastedPtr, + PollyIRBuilder &Builder, + SmallPtrSet &InstsToBeDeleted) { + + // We use a worklist based algorithm that keep the frontier of + // `User`s we need to rewrite in `Next`, and the current iterations + // in `Current`. + SmallPtrSet Next; + SmallPtrSet Current = {Inst}; + + while (!Current.empty()) { + + for (Instruction *CurInst : Current) { + + Builder.SetInsertPoint(CurInst); + for (unsigned i = 0; i < CurInst->getNumOperands(); i++) { + User *OperandAsUser = dyn_cast(CurInst->getOperand(i)); + assert(OperandAsUser && "operandAsUser obtained was not a User."); + + if (isa(OperandAsUser)) { + if (OperandAsUser == ArrPtrToRewrite) { + CurInst->setOperand(i, NewBitcastedPtr); + } + } else { + assert(isa(OperandAsUser)); + + if (isa(OperandAsUser) || + isa(OperandAsUser) || + isa(OperandAsUser)) + continue; + + if (isa(OperandAsUser)) { + if (OperandAsUser == ArrPtrToRewrite) { + CurInst->setOperand(i, NewBitcastedPtr); + } + continue; + } + + // Only things that can contain a reference is a ConstantExpr + ConstantExpr *ValueConstExpr = dyn_cast(OperandAsUser); + assert(ValueConstExpr && "this must be a ValueConstExpr"); + + expandConstantExpr(ValueConstExpr, Builder, CurInst, i, Next); + + } // end else + } // end operands for + } // end for current + + Current.clear(); + Current = Next; + + Next.clear(); + } // end worklist +} + +// Given a value `Current`, return all Instructions that may contain `Current` +// in an expression. +static void getContainingInstructions(Value *Current, + std::vector &Owners) { + if (auto *I = dyn_cast(Current)) { + Owners.push_back(I); + } else { + // Anything that is a `User` must be a constant or an instruction? + // (what about DerivedUser) + auto *C = cast(Current); + for (Use &CUse : C->uses()) + getContainingInstructions(CUse.getUser(), Owners); + } +} + +static void +replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array, + SmallPtrSet &ReplacedGlobals, + SmallPtrSet &InstsToBeDeleted) { + static const unsigned AddrSpace = 0; + // We only want arrays. + ArrayType *ArrayTy = dyn_cast(Array.getType()->getElementType()); + if (!ArrayTy) + return; + Type *ElemTy = ArrayTy->getElementType(); + PointerType *ElemPtrTy = PointerType::get(ElemTy, AddrSpace); + + // We only wish to replace stuff with internal linkage. Otherwise, + // our type edit from [T] to T* would be illegal across modules. + // It is interesting that most arrays don't seem to be tagged with internal + // linkage? + if (GlobalValue::isWeakForLinker(Array.getLinkage()) && false) { + return; + } + + if (!Array.hasInitializer() || + !isa(Array.getInitializer())) + return; + + // At this point, we have committed to replacing this array. + ReplacedGlobals.insert(&Array); + + std::string NewName = (Array.getName() + Twine(".toptr")).str(); + GlobalVariable *ReplacementToArr = + cast(M.getOrInsertGlobal(NewName, ElemPtrTy)); + ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy)); + + Function *PollyMallocManaged = GetOrCreatePollyMallocManaged(M); + Twine FnName = Array.getName() + ".constructor"; + PollyIRBuilder Builder(M.getContext()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), {}, false); + const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + Function *F = Function::Create(Ty, Linkage, FnName, &M); + BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F); + Builder.SetInsertPoint(Start); + + int ArraySizeInt = DL.getTypeAllocSizeInBits(ArrayTy); + Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), ArraySizeInt); + ArraySize->setName("array.size"); + + Value *AllocatedMemRaw = + Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw"); + Value *AllocatedMemTyped = + Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed"); + Builder.CreateStore(AllocatedMemTyped, ReplacementToArr); + Builder.CreateRetVoid(); + + const int Priority = 0; + appendToGlobalCtors(M, F, Priority, ReplacementToArr); + + std::vector ArrayUserInstructions; + // Get all instructions that use array. We need to do this weird thing + // because `Constant`s that contain this array neeed to be expanded into + // instructions so that we can replace their parameters. `Constant`s cannot + // be edited easily, so we choose to convert all `Constant`s to + // `Instruction`s and handle all of the uses of `Array` uniformly. + for (Use &ArrayUse : Array.uses()) { + getContainingInstructions(ArrayUse.getUser(), ArrayUserInstructions); + } + + for (Instruction *UserOfArrayInst : ArrayUserInstructions) { + if (InstsToBeDeleted.count(UserOfArrayInst)) + continue; + + Builder.SetInsertPoint(UserOfArrayInst); + // ** -> * + Value *ArrPtrLoaded = Builder.CreateLoad(ReplacementToArr, "arrptr.load"); + // * -> [ty]* + Value *ArrPtrBitcasted = Builder.CreateBitCast( + ArrPtrLoaded, PointerType::get(ArrayTy, AddrSpace), "arrptr.bitcast"); + rewriteArrToPtr(UserOfArrayInst, &Array, ArrPtrLoaded, ArrPtrBitcasted, + Builder, InstsToBeDeleted); + } +} + +// We return all `allocas` that may need to be converted to a call to +// cudaMallocManaged. +void getAllocasToBeManaged(Function &F, std::set &Allocas) { + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + auto *Alloca = dyn_cast(&I); + if (!Alloca) + continue; + dbgs() << "Checking if " << *Alloca << "may be captured: "; + + if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false, + /* StoreCaptures */ true)) { + Allocas.insert(Alloca); + DEBUG(dbgs() << "YES (captured)\n"); + } else { + DEBUG(dbgs() << "NO (not captured)\n"); + } + } + } +} + +void rewriteAllocaAsManagedMemory(AllocaInst *Alloca, const DataLayout *DL) { + DEBUG(dbgs() << "rewriting: " << *Alloca << " to managed mem.\n"); + Module *M = Alloca->getModule(); + assert(M && "Alloca does not have a module"); + + Function *F = Alloca->getFunction(); + + // TODO: do not consider "scalar" allocas like int. + PollyIRBuilder Builder(M->getContext()); + Builder.SetInsertPoint(Alloca); + + Value *MallocManagedFn = GetOrCreatePollyMallocManaged(*Alloca->getModule()); + const int Size = DL->getTypeAllocSize(Alloca->getType()->getElementType()); + Value *SizeVal = ConstantInt::get(Builder.getInt64Ty(), Size); + Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal}); + Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType()); + + Bitcasted->takeName(Alloca); + Alloca->replaceAllUsesWith(Bitcasted); + Alloca->eraseFromParent(); + + assert(F && "Alloca has invalid function"); + for (BasicBlock &BB : *F) { + ReturnInst *Return = dyn_cast(BB.getTerminator()); + if (!Return) + continue; + Builder.SetInsertPoint(Return); + + Value *FreeManagedFn = GetOrCreatePollyFreeManaged(*M); + Builder.CreateCall(FreeManagedFn, {RawManagedMem}); + } +} + class ManagedMemoryRewritePass : public ModulePass { public: static char ID; GPUArch Architecture; GPURuntime Runtime; + const DataLayout *DL; + ManagedMemoryRewritePass() : ModulePass(ID) {} + virtual bool runOnModule(Module &M) { + DL = &M.getDataLayout(); + Function *Malloc = M.getFunction("malloc"); if (Malloc) { @@ -114,6 +363,33 @@ Free->eraseFromParent(); } + SmallPtrSet InstsToBeDeleted; + SmallPtrSet GlobalsToErase; + + for (GlobalVariable &Global : M.globals()) { + replaceGlobalArray(M, *DL, Global, GlobalsToErase, InstsToBeDeleted); + } + + for (Instruction *Inst : InstsToBeDeleted) { + Inst->eraseFromParent(); + } + // Erase all globals from the parent + for (GlobalVariable *G : GlobalsToErase) { + G->eraseFromParent(); + } + + // Rewrite allocas to cudaMallocs if we are asked to do so. + if (RewriteAllocas) { + std::set AllocasToBeManaged; + for (Function &F : M.functions()) { + getAllocasToBeManaged(F, AllocasToBeManaged); + } + + for (AllocaInst *Alloca : AllocasToBeManaged) { + rewriteAllocaAsManagedMemory(Alloca, DL); + } + } + return true; } }; Index: lib/Support/RegisterPasses.cpp =================================================================== --- lib/Support/RegisterPasses.cpp +++ lib/Support/RegisterPasses.cpp @@ -45,6 +45,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/SROA.h" #include "llvm/Transforms/Vectorize.h" using namespace llvm; @@ -349,6 +350,7 @@ if (Target == TARGET_HYBRID) { PM.add( polly::createPPCGCodeGenerationPass(GPUArchChoice, GPURuntimeChoice)); + // PM.add(llvm::createSROAPass()); PM.add(polly::createManagedMemoryRewritePassPass(GPUArchChoice, GPURuntimeChoice)); } Index: test/GPGPU/managed-memory-rewrite-alloca.ll =================================================================== --- /dev/null +++ test/GPGPU/managed-memory-rewrite-alloca.ll @@ -0,0 +1,61 @@ +; RUN: opt %loadPolly -analyze -polly-process-unprofitable \ +; RUN: -polly-scops -polly-use-llvm-names < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ +; RUN: -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ +; RUN: -polly-acc-rewrite-managed-memory -polly-acc-rewrite-allocas < %s | FileCheck %s --check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP: i32 MemRef_arr[*]; + +; Check that we generate a constructor call for @A.toptr +; HOST-IR-NOT: %arr = alloca [100 x i32] + +source_filename = "test.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + + +define void @f() { +entry: + %arr = alloca [100 x i32] + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %arr, i64 0, i64 %indvars.iv1 + store i32 42, i32* %arrayidx, align 4, !tbaa !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} Index: test/GPGPU/simple-managed-memory-rewrite.ll =================================================================== --- /dev/null +++ test/GPGPU/simple-managed-memory-rewrite.ll @@ -0,0 +1,72 @@ +; RUN: opt %loadPolly -analyze -polly-process-unprofitable \ +; RUN: -polly-scops -polly-use-llvm-names < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ +; RUN: -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ +; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP: i32 MemRef_A[*]; + +; Check that we generate a constructor call for @A.toptr +; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @A.constructor, i8* bitcast (i32** @A.toptr to i8*) }] + +; Check that we generate a constructor +; HOST-IR: define void @A.constructor() { +; HOST-IR-NEXT: entry: +; HOST-IR-NEXT: %mem.raw = call i8* @polly_mallocManaged(i64 3200) +; HOST-IR-NEXT: %mem.typed = bitcast i8* %mem.raw to i32* +; HOST-IR-NEXT: store i32* %mem.typed, i32** @A.toptr +; HOST-IR-NEXT: ret void +; HOST-IR-NEXT: } + +; HOST-IR-NOT: @A + +source_filename = "test.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +@A = common global [100 x i32] zeroinitializer, align 16 + +define void @f() { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @A, i64 0, i64 %indvars.iv1 + store i32 42, i32* %arrayidx, align 4, !tbaa !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -33,8 +33,8 @@ #include #include -static int DebugMode; -static int CacheMode; +static int DebugMode = 1; +static int CacheMode = 1; static PollyGPURuntime Runtime = RUNTIME_NONE; @@ -1455,6 +1455,7 @@ } void *polly_mallocManaged(size_t size) { + polly_initContextCUDA(); dump_function(); void *a; if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) { @@ -1462,6 +1463,7 @@ exit(-1); } addManagedPtr(a); + return a; }