Index: lib/CodeGen/ManagedMemoryRewrite.cpp =================================================================== --- lib/CodeGen/ManagedMemoryRewrite.cpp +++ lib/CodeGen/ManagedMemoryRewrite.cpp @@ -1,5 +1,4 @@ -//===------ ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory. -//---===// +//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===// // // The LLVM Compiler Infrastructure // @@ -34,6 +33,7 @@ #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DerivedUser.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" @@ -43,14 +43,15 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + + +#define DEBUG_TYPE "polly-acc-rewrite-managed-memory" namespace { static llvm::Function *GetOrCreatePollyMallocManaged(Module &M) { // TODO: should I allow this pass to be a standalone pass that // doesn't care if PollyManagedMemory is enabled or not? - assert(PollyManagedMemory && - "One should only rewrite malloc & free to" - "polly_{malloc,free}Managed with managed memory enabled."); const char *Name = "polly_mallocManaged"; Function *F = M.getFunction(Name); @@ -70,9 +71,6 @@ static llvm::Function *GetOrCreatePollyFreeManaged(Module &M) { // TODO: should I allow this pass to be a standalone pass that // doesn't care if PollyManagedMemory is enabled or not? - assert(PollyManagedMemory && - "One should only rewrite malloc & free to" - "polly_{malloc,free}Managed with managed memory enabled."); const char *Name = "polly_freeManaged"; Function *F = M.getFunction(Name); @@ -89,13 +87,286 @@ return F; } +// Expand the constant expression Cur using Builder. This will recursively +// expand `Cur` to arrive at a set of instructions. +// `Expands` is populated all the expanded instructions. +// NOTE: this simply `insert`s into Expands. +static void expandConstantExpr(ConstantExpr *Cur, + PollyIRBuilder &Builder, + Instruction *Parent, + int index, + std::set> &Expands) { + DEBUG(dbgs() << "\n\n\nExpanding: " << *Cur << "\n"; + dbgs() << "Parent: " << *Parent << "\n";); + assert(Cur && "invalid constant expression passed"); + std::vector> Replacements; + + Instruction *I = Cur->getAsInstruction(); + Expands.insert({Parent, I}); + Parent->setOperand(index, I); + + assert(I && "unable to convert ConstantExpr to Instruction"); + // I need instructions to be created before this. + Builder.SetInsertPoint(Parent); + Builder.Insert(I); + + DEBUG(dbgs() << "Expanded: " << *I << "\n";); + for (unsigned i = 0; i < Cur->getNumOperands(); i++) { + Constant *COp = dyn_cast(Cur->getOperand(i)); + assert(COp && "constant must have a constant operand"); + if (isa(COp)) { + expandConstantExpr(dyn_cast(COp), Builder, I, i, Expands); + }; + } +} + +// rewrite a GEP to strip of the first index +// We need to do this because earlier it used to be @[i32 x ] +// It is now i32*. We don't need an extra "@" dereference. +// Parameters: +// MaybeGEP: A `User` that might be a GEP +// ArrToRewrite: Global array we wish to rewrite to a pointer (@A) +// NewLoadedPtr: New pointer value to rewrite the global array with (A.toptr that has been loaded) +// IRBuilder: IRBuilder instance +/* +static bool rewriteGEP(Instruction *MaybeGEP, Instruction *Parent, Value *ArrToRewrite, Value *NewLoadedPtr, + PollyIRBuilder &IRBuilder, + std::set &InstsToBeDeleted ) { + DEBUG(dbgs() << "\n\n\n";); + DEBUG(dbgs() << "CurInst: " << *MaybeGEP << "\n";); + if (Parent) + DEBUG(dbgs() << "Owning Inst: " << *Parent << "\n";); + else + DEBUG(dbgs() << "Owning Inst: " << "NONE" << "\n";); + DEBUG(dbgs() << "TargerArr: " << *ArrToRewrite << "\n";); + GetElementPtrInst *GEP = dyn_cast(MaybeGEP); + + if (!GEP) + return false; + if (!(GEP->getPointerOperand() == ArrToRewrite)) + return false; + + DEBUG(dbgs() << " Is GEP\n";); + + auto Indices = GEP->indices(); + std::vector NewIndices(Indices.begin() + 1, Indices.end()); + + Value *NewGEP = IRBuilder.CreateGEP(NewLoadedPtr, NewIndices, "newgep"); + + // Either the owning instruction is a GEP, or is an instruction that + // contains a GEP. + if (Parent == nullptr) { + GEP->replaceAllUsesWith(NewGEP); + InstsToBeDeleted.insert(GEP); + } + else { + + if (GEP->getNumUses() == 1) { + DEBUG(dbgs() << "\n\n\n@@@ GEP dropping to 0" << *GEP << "\n"); + }; + DEBUG(dbgs() << "Replacing GEP(" << *GEP << ")\n\twith NewGEP(" << *NewGEP << ")\n\tin Parent(" << *Parent << ")...\n"); + Parent->replaceUsesOfWith(GEP, NewGEP); + DEBUG(dbgs() << "Parent after replacement: " << *Parent << "\n";); + DEBUG(dbgs() << "GEP->numUses() " << GEP->getNumUses() << "\n";); + + // GEP can be used by other people, so we can't remove it. + if (GEP->getNumUses() == 0) { + InstsToBeDeleted.insert(GEP); + DEBUG(dbgs() << "@@@ GEP AT 0: " << *GEP << "\n";); + } + } + return true; +} +*/ + +// Edit all uses of `ArrPtrToRewrite` to `NewLoadedPtr` in `Inst`. +// This will change all `GEP`s into `ArrPtrToRewrite` to `NewLoadedPtr`, re-indexing +// the GEPs correctly as well. +// It will change all raw uses of `ArrPtrToRewrite` to `NewBitcastedPtr`. +static void rewriteArrToPtr(Instruction *Inst, Value *ArrPtrToRewrite, Value *NewLoadedPtr, + Value *NewBitcastedPtr, PollyIRBuilder &Builder, + std::set&InstsToBeDeleted) { + + // We use a worklist based algorithm that keep the frontier of + // `User`s we need to rewrite in `Next`, and the current iterations + // in `Current`. + std::set> Next; + std::set> Current = {std::make_pair(nullptr, Inst)}; + + while (!Current.empty()) { + + for (const std::pair &ParentInstPair : Current) { + Instruction *CurInst = ParentInstPair.second; + Instruction *Parent = ParentInstPair.first; + + Builder.SetInsertPoint(CurInst); + // Try to rewrite the current as a GEP. + // If we can generate a GEP from the instruction, then we are done, + // because we have replaced the old array with the new pointer. + // if (rewriteGEP(CurInst, Parent, ArrPtrToRewrite, NewLoadedPtr, Builder, InstsToBeDeleted)) + // continue; + + for (unsigned i = 0; i < CurInst->getNumOperands(); i++) { + User *OperandAsUser = dyn_cast(CurInst->getOperand(i)); + + if (!OperandAsUser) { + errs() << "\t\t" << *OperandAsUser << " obtained from: " << *CurInst + << "is not a user!. Trying to replace: " + << *ArrPtrToRewrite << " with: " << *NewBitcastedPtr << "failed.\n"; + report_fatal_error("rewriteArrToPtr failed with value that was not user"); + } + assert(OperandAsUser && "operandAsUser uninitialized"); + + if (isa(OperandAsUser)) { + if (OperandAsUser == ArrPtrToRewrite) { + CurInst->setOperand(i, NewBitcastedPtr); + } + } else { + assert(isa(OperandAsUser)); + + if (isa(OperandAsUser) || + isa(OperandAsUser) || + isa(OperandAsUser)) + continue; + + if (isa(OperandAsUser)) { + if (OperandAsUser == ArrPtrToRewrite) { + CurInst->setOperand(i, NewBitcastedPtr); + } + continue; + } + + // Only things that can contain a reference is a ConstantExpr + ConstantExpr *ValueConstExpr = dyn_cast(OperandAsUser); + assert(ValueConstExpr && "this must be a ValueConstExpr"); + + expandConstantExpr(ValueConstExpr, Builder, CurInst, i, Next); + + } // end else + } // end operands for + } // end for current + + Current.clear(); + Current = Next; + + Next.clear(); + } // end worklist +} + +// Given a value `Current`, return all Instructions that may contain `Current` +// in an expression. +static void getContainingInstructions(Value *Current, + std::vector &Owners) { + Instruction *I; + Constant *C; + if ((I = dyn_cast(Current))) { + Owners.push_back(I); + } else if ((C = dyn_cast(Current))) { + for (Use &CUse : C->uses()) { + getContainingInstructions(CUse.getUser(), Owners); + } + } else { + errs() << "(" << *Current + << ") is neither an instruction nor a constant!.\n" + "The process of finding the owning instruction reached a node " + "with unknown replacement strategy"; + report_fatal_error("unable to find owning instruction"); + llvm_unreachable("should never reach here from getContainingInstruction"); + } +} + +static void RewriteGlobalArray(Module &M, const DataLayout &DL, + GlobalVariable &Array, + std::set &ReplacedGlobals, + std::set &InstsToBeDeleted) { + static const unsigned AddrSpace = 0; + // We only want arrays. + ArrayType *ArrayTy = dyn_cast(Array.getType()->getElementType()); + if (!ArrayTy) + return; + Type *ElemTy = ArrayTy->getElementType(); + PointerType *ElemPtrTy = PointerType::get(ElemTy, AddrSpace); + + // We only wish to replace stuff with internal linkage. Otherwise, + // our type edit from [T] to T* would be illegal across modules. + // It is interesting that most arrays don't seem to be tagged with internal + // linkage? + if (GlobalValue::isWeakForLinker(Array.getLinkage()) && false) { + return; + } + + if (!Array.hasInitializer() || + !isa(Array.getInitializer())) + return; + + // At this point, we have committed to replacing this array. + ReplacedGlobals.insert(&Array); + + std::string NewName = (Array.getName() + Twine(".toptr")).str(); + GlobalVariable *ReplacementToArr = + dyn_cast(M.getOrInsertGlobal(NewName, ElemPtrTy)); + ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy)); + + Function *PollyMallocManaged = GetOrCreatePollyMallocManaged(M); + Twine FnName = Array.getName() + ".constructor"; + PollyIRBuilder Builder(M.getContext()); + FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), {}, false); + const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; + Function *F = Function::Create(Ty, Linkage, FnName, &M); + BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F); + Builder.SetInsertPoint(Start); + + int ArraySizeInt = DL.getTypeAllocSizeInBits(ArrayTy); + Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), ArraySizeInt); + ArraySize->setName("array.size"); + + Value *AllocatedMemRaw = + Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw"); + Value *AllocatedMemTyped = + Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed"); + Builder.CreateStore(AllocatedMemTyped, ReplacementToArr); + Builder.CreateRetVoid(); + + // HACK: refactor the priority stuff. + static int priority = 0; + appendToGlobalCtors(M, F, priority++, ReplacementToArr); + + std::vector ArrayUserInstructions; + // Get all instructions that use array. We need to do this weird thing + // because `Constant`s that contain + for (Use &ArrayUse : Array.uses()) { + getContainingInstructions(ArrayUse.getUser(), ArrayUserInstructions); + } + + for (Instruction *UserOfArrayInst : ArrayUserInstructions) { + if (InstsToBeDeleted.count(UserOfArrayInst)) continue; + + Builder.SetInsertPoint(UserOfArrayInst); + // ** -> * + Value *ArrPtrLoaded = Builder.CreateLoad(ReplacementToArr, "arrptr.load"); + // * -> [ty]* + Value *ArrPtrBitcasted = Builder.CreateBitCast(ArrPtrLoaded, PointerType::get(ArrayTy, AddrSpace), "arrptr.bitcast"); + rewriteArrToPtr(UserOfArrayInst, &Array, ArrPtrLoaded, ArrPtrBitcasted, + Builder, InstsToBeDeleted); + } + +} + +void rewriteFunctionParameters(Function *F) { +} + class ManagedMemoryRewritePass : public ModulePass { public: static char ID; GPUArch Architecture; GPURuntime Runtime; + const DataLayout *DL; + ManagedMemoryRewritePass() : ModulePass(ID) {} + virtual bool runOnModule(Module &M) { + DL = &M.getDataLayout(); + Function *Malloc = M.getFunction("malloc"); if (Malloc) { @@ -114,6 +385,28 @@ Free->eraseFromParent(); } + std::setInstsToBeDeleted; + std::set GlobalsToErase; + + for (GlobalVariable &Global : M.globals()) { + RewriteGlobalArray(M, *DL, Global, GlobalsToErase, InstsToBeDeleted); + } + + DEBUG(dbgs() << "\n\n\n=====Module=====\n"; + M.dump(); + dbgs() << "=====\n";); + + for(Instruction *Inst : InstsToBeDeleted) { + DEBUG(dbgs() << "\n\nRemoving: " << *Inst << "...\n";); + Inst->eraseFromParent(); + DEBUG(dbgs() << "Successful\n";); + } + // Erase all globals from the parent + for(GlobalVariable *G : GlobalsToErase) { + G->eraseFromParent(); + } + + return true; } }; Index: test/GPGPU/simple-managed-memory-rewrite.ll =================================================================== --- /dev/null +++ test/GPGPU/simple-managed-memory-rewrite.ll @@ -0,0 +1,70 @@ +; RUN: opt %loadPolly -analyze -polly-process-unprofitable \ +; RUN: -polly-scops -polly-use-llvm-names < %s | FileCheck %s --check-prefix=SCOP + +; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \ +; RUN: -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory \ +; RUN: -polly-acc-rewrite-managed-memory < %s | FileCheck %s --check-prefix=HOST-IR + +; SCOP: Function: f +; SCOP-NEXT: Region: %for.body---%for.end +; SCOP-NEXT: Max Loop Depth: 1 +; SCOP: i32 MemRef_A[*]; + +; Check that we generate a constructor call for @A.toptr +; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @A.constructor, i8* bitcast (i32** @A.toptr to i8*) }] + +; Check that we generate a constructor +; HOST-IR: define void @A.constructor() { +; HOST-IR-NEXT: entry: +; HOST-IR-NEXT: %mem.raw = call i8* @polly_mallocManaged(i64 320000) +; HOST-IR-NEXT: %mem.typed = bitcast i8* %mem.raw to i32* +; HOST-IR-NEXT: store i32* %mem.typed, i32** @A.toptr +; HOST-IR-NEXT: ret void +; HOST-IR-NEXT: } + +; HOST-IR-NOT: @A + +source_filename = "test.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +@A = common global [100 x i32] zeroinitializer, align 16 + +define void @f() { +entry: + br label %entry.split + +entry.split: ; preds = %entry + br label %for.body + +for.body: ; preds = %entry.split, %for.body + %indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @A, i64 0, i64 %indvars.iv1 + store i32 42, i32* %arrayidx, align 4, !tbaa !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0 + + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0 + +attributes #0 = { argmemonly nounwind } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -33,8 +33,8 @@ #include #include -static int DebugMode; -static int CacheMode; +static int DebugMode = 1; +static int CacheMode = 1; static PollyGPURuntime Runtime = RUNTIME_NONE; @@ -1455,6 +1455,7 @@ } void *polly_mallocManaged(size_t size) { + polly_initContextCUDA(); dump_function(); void *a; if (cudaMallocManaged(&a, size, cudaMemAttachGlobal) != cudaSuccess) { @@ -1462,6 +1463,8 @@ exit(-1); } addManagedPtr(a); + + fprintf(stderr, "\n *** cudaMallocManaged return value: %p\n", a); return a; }