Index: polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp =================================================================== --- polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp +++ polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp @@ -1,5 +1,4 @@ -//===------ ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory. -//---===// +//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===// // // The LLVM Compiler Infrastructure // @@ -30,6 +29,7 @@ #include "polly/Support/SCEVValidator.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -43,14 +43,25 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +static cl::opt RewriteAllocas( + "polly-acc-rewrite-allocas", + cl::desc( + "Ask the managed memory rewriter to also rewrite alloca instructions"), + cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); + +static cl::opt IgnoreLinkageForGlobals( + "polly-acc-rewrite-ignore-linkage-for-globals", + cl::desc( + "By default, we only rewrite globals with internal linkage. This flag " + "enables rewriting of globals regardless of linkage"), + cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory)); + +#define DEBUG_TYPE "polly-acc-rewrite-managed-memory" namespace { -static llvm::Function *GetOrCreatePollyMallocManaged(Module &M) { - // TODO: should I allow this pass to be a standalone pass that - // doesn't care if PollyManagedMemory is enabled or not? - assert(PollyManagedMemory && - "One should only rewrite malloc & free to" - "polly_{malloc,free}Managed with managed memory enabled."); +static llvm::Function *getOrCreatePollyMallocManaged(Module &M) { const char *Name = "polly_mallocManaged"; Function *F = M.getFunction(Name); @@ -67,12 +78,7 @@ return F; } -static llvm::Function *GetOrCreatePollyFreeManaged(Module &M) { - // TODO: should I allow this pass to be a standalone pass that - // doesn't care if PollyManagedMemory is enabled or not? - assert(PollyManagedMemory && - "One should only rewrite malloc & free to" - "polly_{malloc,free}Managed with managed memory enabled."); +static llvm::Function *getOrCreatePollyFreeManaged(Module &M) { const char *Name = "polly_freeManaged"; Function *F = M.getFunction(Name); @@ -89,17 +95,242 @@ return F; } +// Expand a constant expression `Cur`, which is used at instruction `Parent` +// at index `index`. +// Since a constant expression can expand to multiple instructions, store all +// the expands into a set called `Expands`. +// Note that this goes inorder on the constant expression tree. +// A * ((B * D) + C) +// will be processed with first A, then B * D, then B, then D, and then C. +// Though ConstantExprs are not treated as "trees" but as DAGs, since you can +// have something like this: +// * +// / \ +// \ / +// (D) +// +// For the purposes of this expansion, we expand the two occurences of D +// separately. Therefore, we expand the DAG into the tree: +// * +// / \ +// D D +// TODO: We don't _have_to do this, but this is the simplest solution. +// We can write a solution that keeps track of which constants have been +// already expanded. +static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder, + Instruction *Parent, int index, + SmallPtrSet &Expands) { + assert(Cur && "invalid constant expression passed"); + + Instruction *I = Cur->getAsInstruction(); + Expands.insert(I); + Parent->setOperand(index, I); + + assert(I && "unable to convert ConstantExpr to Instruction"); + // The things that `Parent` uses (its operands) should be created + // before `Parent`. + Builder.SetInsertPoint(Parent); + Builder.Insert(I); + + DEBUG(dbgs() << "Expanding ConstantExpression: " << *Cur + << " | in Instruction: " << *I << "\n";); + for (unsigned i = 0; i < Cur->getNumOperands(); i++) { + Value *Op = Cur->getOperand(i); + assert(isa(Op) && "constant must have a constant operand"); + + if (ConstantExpr *CExprOp = dyn_cast(Op)) + expandConstantExpr(CExprOp, Builder, I, i, Expands); + } +} + +// Edit all uses of `OldVal` to NewVal` in `Inst`. This will rewrite +// `ConstantExpr`s that are used in the `Inst`. +// Note that `replaceAllUsesWith` is insufficient for this purpose because it +// does not rewrite values in `ConstantExpr`s. +static void rewriteOldValToNew(Instruction *Inst, Value *OldVal, Value *NewVal, + PollyIRBuilder &Builder) { + + // This contains a set of instructions in which OldVal must be replaced. + // We start with `Inst`, and we fill it up with the expanded `ConstantExpr`s + // from `Inst`s arguments. + // We need to go through this process because `replaceAllUsesWith` does not + // actually edit `ConstantExpr`s. + SmallPtrSet InstsToVisit = {Inst}; + + // Expand all `ConstantExpr`s and place it in `InstsToVisit`. + for (unsigned i = 0; i < Inst->getNumOperands(); i++) { + Value *Operand = Inst->getOperand(i); + if (ConstantExpr *ValueConstExpr = dyn_cast(Operand)) + expandConstantExpr(ValueConstExpr, Builder, Inst, i, InstsToVisit); + } + + // Now visit each instruction and use `replaceUsesOfWith`. We know that + // will work because `I` cannot have any `ConstantExpr` within it. + for (Instruction *I : InstsToVisit) + I->replaceUsesOfWith(OldVal, NewVal); +} + +// Given a value `Current`, return all Instructions that may contain `Current` +// in an expression. +// We need this auxiliary function, because if we have a +// `Constant` that is a user of `V`, we need to recurse into the +// `Constant`s uses to gather the root instruciton. +static void getInstructionUsersOfValue(Value *V, + SmallVector &Owners) { + if (auto *I = dyn_cast(V)) { + Owners.push_back(I); + } else { + // Anything that is a `User` must be a constant or an instruction. + auto *C = cast(V); + for (Use &CUse : C->uses()) + getInstructionUsersOfValue(CUse.getUser(), Owners); + } +} + +static void +replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array, + SmallPtrSet &ReplacedGlobals) { + // We only want arrays. + ArrayType *ArrayTy = dyn_cast(Array.getType()->getElementType()); + if (!ArrayTy) + return; + Type *ElemTy = ArrayTy->getElementType(); + PointerType *ElemPtrTy = ElemTy->getPointerTo(); + + // We only wish to replace arrays that are visible in the module they + // inhabit. Otherwise, our type edit from [T] to T* would be illegal across + // modules. + const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() || + Array.hasInternalLinkage() || + IgnoreLinkageForGlobals; + if (!OnlyVisibleInsideModule) + return; + + if (!Array.hasInitializer() || + !isa(Array.getInitializer())) + return; + + // At this point, we have committed to replacing this array. + ReplacedGlobals.insert(&Array); + + std::string NewName = (Array.getName() + Twine(".toptr")).str(); + GlobalVariable *ReplacementToArr = + cast(M.getOrInsertGlobal(NewName, ElemPtrTy)); + ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy)); + + Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M); + Twine FnName = Array.getName() + ".constructor"; + PollyIRBuilder Builder(M.getContext()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false); + const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + Function *F = Function::Create(Ty, Linkage, FnName, &M); + BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F); + Builder.SetInsertPoint(Start); + + int ArraySizeInt = DL.getTypeAllocSizeInBits(ArrayTy) / 8; + Value *ArraySize = Builder.getInt64(ArraySizeInt); + ArraySize->setName("array.size"); + + Value *AllocatedMemRaw = + Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw"); + Value *AllocatedMemTyped = + Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed"); + Builder.CreateStore(AllocatedMemTyped, ReplacementToArr); + Builder.CreateRetVoid(); + + const int Priority = 0; + appendToGlobalCtors(M, F, Priority, ReplacementToArr); + + SmallVector ArrayUserInstructions; + // Get all instructions that use array. We need to do this weird thing + // because `Constant`s that contain this array neeed to be expanded into + // instructions so that we can replace their parameters. `Constant`s cannot + // be edited easily, so we choose to convert all `Constant`s to + // `Instruction`s and handle all of the uses of `Array` uniformly. + for (Use &ArrayUse : Array.uses()) + getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions); + + for (Instruction *UserOfArrayInst : ArrayUserInstructions) { + + Builder.SetInsertPoint(UserOfArrayInst); + // ** -> * + Value *ArrPtrLoaded = Builder.CreateLoad(ReplacementToArr, "arrptr.load"); + // * -> [ty]* + Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast( + ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast"); + rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder); + } +} + +// We return all `allocas` that may need to be converted to a call to +// cudaMallocManaged. +static void getAllocasToBeManaged(Function &F, + SmallSet &Allocas) { + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + auto *Alloca = dyn_cast(&I); + if (!Alloca) + continue; + dbgs() << "Checking if " << *Alloca << "may be captured: "; + + if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false, + /* StoreCaptures */ true)) { + Allocas.insert(Alloca); + DEBUG(dbgs() << "YES (captured)\n"); + } else { + DEBUG(dbgs() << "NO (not captured)\n"); + } + } + } +} + +static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca, + const DataLayout &DL) { + DEBUG(dbgs() << "rewriting: " << *Alloca << " to managed mem.\n"); + Module *M = Alloca->getModule(); + assert(M && "Alloca does not have a module"); + + PollyIRBuilder Builder(M->getContext()); + Builder.SetInsertPoint(Alloca); + + Value *MallocManagedFn = getOrCreatePollyMallocManaged(*Alloca->getModule()); + const int Size = DL.getTypeAllocSize(Alloca->getType()->getElementType()); + Value *SizeVal = Builder.getInt64(Size); + Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal}); + Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType()); + + Function *F = Alloca->getFunction(); + assert(F && "Alloca has invalid function"); + + Bitcasted->takeName(Alloca); + Alloca->replaceAllUsesWith(Bitcasted); + Alloca->eraseFromParent(); + + for (BasicBlock &BB : *F) { + ReturnInst *Return = dyn_cast(BB.getTerminator()); + if (!Return) + continue; + Builder.SetInsertPoint(Return); + + Value *FreeManagedFn = getOrCreatePollyFreeManaged(*M); + Builder.CreateCall(FreeManagedFn, {RawManagedMem}); + } +} + class ManagedMemoryRewritePass : public ModulePass { public: static char ID; GPUArch Architecture; GPURuntime Runtime; + ManagedMemoryRewritePass() : ModulePass(ID) {} virtual bool runOnModule(Module &M) { + const DataLayout &DL = M.getDataLayout(); + Function *Malloc = M.getFunction("malloc"); if (Malloc) { - Function *PollyMallocManaged = GetOrCreatePollyMallocManaged(M); + Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M); assert(PollyMallocManaged && "unable to create polly_mallocManaged"); Malloc->replaceAllUsesWith(PollyMallocManaged); Malloc->eraseFromParent(); @@ -108,12 +339,28 @@ Function *Free = M.getFunction("free"); if (Free) { - Function *PollyFreeManaged = GetOrCreatePollyFreeManaged(M); + Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M); assert(PollyFreeManaged && "unable to create polly_freeManaged"); Free->replaceAllUsesWith(PollyFreeManaged); Free->eraseFromParent(); } + SmallPtrSet GlobalsToErase; + for (GlobalVariable &Global : M.globals()) + replaceGlobalArray(M, DL, Global, GlobalsToErase); + for (GlobalVariable *G : GlobalsToErase) + G->eraseFromParent(); + + // Rewrite allocas to cudaMallocs if we are asked to do so. + if (RewriteAllocas) { + SmallSet AllocasToBeManaged; + for (Function &F : M.functions()) + getAllocasToBeManaged(F, AllocasToBeManaged); + + for (AllocaInst *Alloca : AllocasToBeManaged) + rewriteAllocaAsManagedMemory(Alloca, DL); + } + return true; } }; Index: polly/trunk/test/GPGPU/managed-memory-rewrite-alloca.ll =================================================================== --- polly/trunk/test/GPGPU/managed-memory-rewrite-alloca.ll +++ polly/trunk/test/GPGPU/managed-memory-rewrite-alloca.ll @@ -0,0 +1,61 @@ +; RUN: opt %loadPolly -analyze -polly-process-unprofitable \ +; RUN: -polly-scops -polly-use-llvm-names < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ +; RUN: -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ +; RUN: -polly-acc-rewrite-managed-memory -polly-acc-rewrite-allocas < %s | FileCheck %s --check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP: i32 MemRef_arr[*]; + +; Check that we generate a constructor call for @A.toptr +; HOST-IR-NOT: %arr = alloca [100 x i32] + +source_filename = "test.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + + +define void @f() { +entry: + %arr = alloca [100 x i32] + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %arr, i64 0, i64 %indvars.iv1 + store i32 42, i32* %arrayidx, align 4, !tbaa !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} Index: polly/trunk/test/GPGPU/simple-managed-memory-rewrite.ll =================================================================== --- polly/trunk/test/GPGPU/simple-managed-memory-rewrite.ll +++ polly/trunk/test/GPGPU/simple-managed-memory-rewrite.ll @@ -0,0 +1,73 @@ +; RUN: opt %loadPolly -analyze -polly-process-unprofitable \ +; RUN: -polly-scops -polly-use-llvm-names < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ +; RUN: -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ +; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR + +; REQUIRES: pollyacc + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP: i32 MemRef_A[*]; + +; Check that we generate a constructor call for @A.toptr +; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.constructor, i8* bitcast (i32** @A.toptr to i8*) }] + +; Check that we generate a constructor +; 4 bytes * 100 = 400 +; HOST-IR: define void @.constructor() { +; HOST-IR-NEXT: entry: +; HOST-IR-NEXT: %mem.raw = call i8* @polly_mallocManaged(i64 400) +; HOST-IR-NEXT: %mem.typed = bitcast i8* %mem.raw to i32* +; HOST-IR-NEXT: store i32* %mem.typed, i32** @A.toptr +; HOST-IR-NEXT: ret void +; HOST-IR-NEXT: } + +; HOST-IR-NOT: @A + +source_filename = "test.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +@A = internal global [100 x i32] zeroinitializer, align 16 + +define void @f() { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @A, i64 0, i64 %indvars.iv1 + store i32 42, i32* %arrayidx, align 4, !tbaa !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"}