This is an archive of the discontinued LLVM Phabricator instance.

lib/CodeGen/ManagedMemoryRewrite.cpp
125 ↗	(On Diff #110434)	Format? Is this more than 80 lines? In general this does not seem to be run through clang format?
145 ↗	(On Diff #110434)	Unrelated change?

This is slightly harder than I initially thought, because,

mutateType on the original global is entirely the wrong way to go about this. It leaves the IR in some inconsistent state.

The right solution (I believe) is to create a replacement global (say A.repl), set it up correctly, visit each use site of the original (say A), and change it to the replacement (A.repl)

The problem is that we can't insert an IRBuilder at a Use to call bitcast. We need an Instruction. But the underlying Value could be a Constant or something, and I can't see a way to extract a location for a builder from a Value. I'm probably missing something in the API, any help appreciated.

Right now, this doesn't actually replace, though it does create a separate A.repl and initialises it (I can see the call to cudaMallocManaged with nvprof).

[WIP] [Polly] [ManagedMemoryRewrite] Rewrite global arrays to pointers that are allocated with managed memory.

@grosser I'm sorry, I seem to be hitting some Phab bug that updates D36516 rather than creating a new revision. The original D36516 was an NFC change. I tried running another arc patch with the same effect (you can see the bollu updated this revision to Diff 110436.). Something super weird is happening.

Anyway, let's discuss the proposed change here. I'll move the [NFC] patch to another revision.

lib/CodeGen/ManagedMemoryRewrite.cpp
125 ↗	(On Diff #110434)	Yes, hence `WIP`. I wished to discuss how to do this correctly. Could you `request changes`, because this patch should not be `LGTM`d. Please see my comment on the patch about the discussion I wish to have.

bollu retitled this revision from [NFC] [ManagedMemoryRewrite] [Polly] Erase original malloc and free to [WIP] [Polly] [ManagedMemoryRewrite] Rewrite global arrays with global pointers that are polly_mallocManage'd.Aug 9 2017, 10:52 AM

bollu edited the summary of this revision. (Show Details)

Do you have some other uncommitted phabricator patches in your branch?

@grosser: could you change your status to "Requesting changes"? That way, you'll see when I push

grosser requested changes to this revision.Aug 10 2017, 1:40 AM

This revision now requires changes to proceed.Aug 10 2017, 1:40 AM

[WIP] not sure why this doesn't work, code seems reasonable

Example of changes induced by this patch on a .ll file

source.c

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
int A[5];
void f(int *arr) {
    for(int i = 0; i < 5; i++) arr[i] = 10;
}


#define CHECK(r) {_check((r), __LINE__);}

void _check(cudaError_t r, int line) {
    if (r != cudaSuccess) {
        printf("CUDA error on line %d: %s\n", line, cudaGetErrorString(r));
        exit(0);
    }
}

void print_arr(int *A) {
    for(int i = 0; i < 5; i++)
        printf("%d := %d\n", i, A[i]);
}


int main() {
    for(int i = 0; i < 5; i++) { A[i] = -42; }
    printf("A before kernel:\n");
    print_arr(A);


    printf("launching kernel...\n");
    f(A);

    printf("printing A...\n");
    print_arr(A);
    return 0;
}

unoptimised.ll

; ModuleID = 'program.ll'
source_filename = "program.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@.str = private unnamed_addr constant [27 x i8] c"CUDA error on line %d: %s\0A\00", align 1
@.str.1 = private unnamed_addr constant [10 x i8] c"%d := %d\0A\00", align 1
@A = common global [5 x i32] zeroinitializer, align 16
@.str.2 = private unnamed_addr constant [18 x i8] c"A before kernel:\0A\00", align 1
@.str.3 = private unnamed_addr constant [21 x i8] c"launching kernel...\0A\00", align 1
@.str.4 = private unnamed_addr constant [15 x i8] c"printing A...\0A\00", align 1
@str = private unnamed_addr constant [17 x i8] c"A before kernel:\00"
@str.1 = private unnamed_addr constant [20 x i8] c"launching kernel...\00"
@str.2 = private unnamed_addr constant [14 x i8] c"printing A...\00"

define void @f(i32* %arr) {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %entry.split, %for.body
  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  store i32 10, i32* %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 5
  br i1 %exitcond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

define void @_check(i32 %r, i32 %line) {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  %cmp = icmp eq i32 %r, 0
  br i1 %cmp, label %if.end, label %if.then

if.then:                                          ; preds = %entry.split
  %call = tail call i8* @cudaGetErrorString(i32 %r)
  %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str, i64 0, i64 0), i32 %line, i8* %call)
  tail call void @exit(i32 0)
  unreachable

if.end:                                           ; preds = %entry.split
  ret void
}

declare i32 @printf(i8*, ...)

declare i8* @cudaGetErrorString(i32)

declare void @exit(i32)

define void @print_arr(i32* %A) {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %entry.split, %for.body
  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp4 = load i32, i32* %arrayidx, align 4
  %0 = trunc i64 %indvars.iv to i32
  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.1, i64 0, i64 0), i32 %0, i32 %tmp4)
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 5
  br i1 %exitcond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

define i32 @main() {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %entry.split, %for.body
  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @A, i64 0, i64 %indvars.iv
  store i32 -42, i32* %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 5
  br i1 %exitcond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  %puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @str, i64 0, i64 0))
  tail call void @print_arr(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @A, i64 0, i64 0))
  %puts1 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str.1, i64 0, i64 0))
  tail call void @f(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @A, i64 0, i64 0))
  %puts2 = tail call i32 @puts(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @str.2, i64 0, i64 0))
  tail call void @print_arr(i32* getelementptr inbounds ([5 x i32], [5 x i32]* @A, i64 0, i64 0))
  ret i32 0
}

; Function Attrs: nounwind
declare i32 @puts(i8* nocapture readonly) #0

attributes #0 = { nounwind }

Optimised `.ll` file

Note that all uses of @A have now become @A.toptr
Also notice that wherever we use @A.toptr, we bitcast it from i32 ** to [i32 x 5]*, which is a legal bitcast (?)

You can run the output program through opt, it passes verify module so there's nothing grossly wrong with it.
Is the bitcast incorrect? Am I missing something obvious?

optimised.ll

; ModuleID = 'program.canonical.ll'
source_filename = "program.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@.str = private unnamed_addr constant [27 x i8] c"CUDA error on line %d: %s\0A\00", align 1
@.str.1 = private unnamed_addr constant [10 x i8] c"%d := %d\0A\00", align 1
@A = common global [5 x i32] zeroinitializer, align 16
@.str.2 = private unnamed_addr constant [18 x i8] c"A before kernel:\0A\00", align 1
@.str.3 = private unnamed_addr constant [21 x i8] c"launching kernel...\0A\00", align 1
@.str.4 = private unnamed_addr constant [15 x i8] c"printing A...\0A\00", align 1
@str = private unnamed_addr constant [17 x i8] c"A before kernel:\00"
@str.1 = private unnamed_addr constant [20 x i8] c"launching kernel...\00"
@str.2 = private unnamed_addr constant [14 x i8] c"printing A...\00"
@FUNC_f_SCOP_0_KERNEL_0 = private unnamed_addr constant [462 x i8] c"//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 3.2\0A.target sm_30\0A.address_size 64\0A\0A\09// .globl\09FUNC_f_SCOP_0_KERNEL_0\0A\0A.visible .entry FUNC_f_SCOP_0_KERNEL_0(\0A\09.param .u64 FUNC_f_SCOP_0_KERNEL_0_param_0\0A)\0A.maxntid 5, 1, 1\0A{\0A\09.reg .b32 \09%r<3>;\0A\09.reg .b64 \09%rd<4>;\0A\0A\09ld.param.u64 \09%rd1, [FUNC_f_SCOP_0_KERNEL_0_param_0];\0A\09mov.u32 \09%r1, %tid.x;\0A\09mul.wide.u32 \09%rd2, %r1, 4;\0A\09add.s64 \09%rd3, %rd1, %rd2;\0A\09mov.u32 \09%r2, 10;\0A\09st.global.u32 \09[%rd3], %r2;\0A\09ret;\0A}\0A\0A\0A\00"
@FUNC_f_SCOP_0_KERNEL_0_name = private unnamed_addr constant [23 x i8] c"FUNC_f_SCOP_0_KERNEL_0\00"
@A.toptr = global i32* null
@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @A.constructor, i8* bitcast (i32** @A.toptr to i8*) }]

define void @f(i32* %arr) {
entry:
  %polly_launch_0_params = alloca [2 x i8*]
  %polly_launch_0_param_0 = alloca i8*
  %polly_launch_0_param_size_0 = alloca i32
  %polly_launch_0_params_i8ptr = bitcast [2 x i8*]* %polly_launch_0_params to i8*
  br label %entry.split

entry.split:                                      ; preds = %entry
  br label %polly.split_new_and_old

polly.split_new_and_old:                          ; preds = %entry.split
  %0 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 1, i64 5)
  %.obit = extractvalue { i64, i1 } %0, 1
  %polly.overflow.state = or i1 false, %.obit
  %.res = extractvalue { i64, i1 } %0, 0
  %1 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 6, i64 %.res)
  %.obit1 = extractvalue { i64, i1 } %1, 1
  %polly.overflow.state2 = or i1 %polly.overflow.state, %.obit1
  %.res3 = extractvalue { i64, i1 } %1, 0
  %2 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 %.res3)
  %.obit4 = extractvalue { i64, i1 } %2, 1
  %polly.overflow.state5 = or i1 %polly.overflow.state2, %.obit4
  %.res6 = extractvalue { i64, i1 } %2, 0
  %3 = icmp sge i64 %.res6, 0
  %4 = and i1 true, %3
  %polly.rtc.overflown = xor i1 %polly.overflow.state5, true
  %polly.rtc.result = and i1 %4, %polly.rtc.overflown
  br i1 %polly.rtc.result, label %polly.start, label %for.body.pre_entry_bb

for.body.pre_entry_bb:                            ; preds = %polly.split_new_and_old
  br label %for.body

for.body:                                         ; preds = %for.body.pre_entry_bb, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.pre_entry_bb ]
  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  store i32 10, i32* %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 5
  br i1 %exitcond, label %for.body, label %polly.merge_new_and_old

polly.merge_new_and_old:                          ; preds = %polly.exiting, %for.body
  br label %for.end

for.end:                                          ; preds = %polly.merge_new_and_old
  ret void

polly.start:                                      ; preds = %polly.split_new_and_old
  br label %polly.acc.initialize

polly.acc.initialize:                             ; preds = %polly.start
  %5 = call i8* @polly_initContextCUDA()
  %6 = bitcast i32* %arr to i8*
  %7 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
  store i8* %6, i8** %polly_launch_0_param_0
  %8 = bitcast i8** %polly_launch_0_param_0 to i8*
  store i8* %8, i8** %7
  store i32 4, i32* %polly_launch_0_param_size_0
  %9 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
  %10 = bitcast i32* %polly_launch_0_param_size_0 to i8*
  store i8* %10, i8** %9
  %11 = call i8* @polly_getKernel(i8* getelementptr inbounds ([462 x i8], [462 x i8]* @FUNC_f_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([23 x i8], [23 x i8]* @FUNC_f_SCOP_0_KERNEL_0_name, i32 0, i32 0))
  call void @polly_launchKernel(i8* %11, i32 1, i32 1, i32 5, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
  call void @polly_freeKernel(i8* %11)
  call void @polly_synchronizeDevice()
  call void @polly_freeContext(i8* %5)
  br label %polly.exiting

polly.exiting:                                    ; preds = %polly.acc.initialize
  br label %polly.merge_new_and_old
}

define void @_check(i32 %r, i32 %line) {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  %cmp = icmp eq i32 %r, 0
  br i1 %cmp, label %if.end, label %if.then

if.then:                                          ; preds = %entry.split
  %call = tail call i8* @cudaGetErrorString(i32 %r)
  %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str, i64 0, i64 0), i32 %line, i8* %call)
  tail call void @exit(i32 0)
  unreachable

if.end:                                           ; preds = %entry.split
  ret void
}

declare i32 @printf(i8*, ...)

declare i8* @cudaGetErrorString(i32)

declare void @exit(i32)

define void @print_arr(i32* %A) {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body, %entry.split
  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp4 = load i32, i32* %arrayidx, align 4
  %0 = trunc i64 %indvars.iv to i32
  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.1, i64 0, i64 0), i32 %0, i32 %tmp4)
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 5
  br i1 %exitcond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  ret void
}

define i32 @main() {
entry:
  br label %entry.split

entry.split:                                      ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body, %entry.split
  %indvars.iv = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* bitcast (i32** @A.toptr to [5 x i32]*), i64 0, i64 %indvars.iv
  store i32 -42, i32* %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp ne i64 %indvars.iv.next, 5
  br i1 %exitcond, label %for.body, label %for.end

for.end:                                          ; preds = %for.body
  %puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @str, i64 0, i64 0))
  tail call void @print_arr(i32* getelementptr inbounds ([5 x i32], [5 x i32]* bitcast (i32** @A.toptr to [5 x i32]*), i64 0, i64 0))
  %puts1 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str.1, i64 0, i64 0))
  tail call void @f(i32* getelementptr inbounds ([5 x i32], [5 x i32]* bitcast (i32** @A.toptr to [5 x i32]*), i64 0, i64 0))
  %puts2 = tail call i32 @puts(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @str.2, i64 0, i64 0))
  tail call void @print_arr(i32* getelementptr inbounds ([5 x i32], [5 x i32]* bitcast (i32** @A.toptr to [5 x i32]*), i64 0, i64 0))
  ret i32 0
}

; Function Attrs: nounwind
declare i32 @puts(i8* nocapture readonly) #0

; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64) #1

; Function Attrs: nounwind readnone speculatable
declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1

declare i8* @polly_initContextCUDA()

declare i8* @polly_getKernel(i8*, i8*)

declare void @polly_launchKernel(i8*, i32, i32, i32, i32, i32, i8*)

declare void @polly_freeKernel(i8*)

declare void @polly_synchronizeDevice()

declare void @polly_freeContext(i8*)

declare i8* @polly_mallocManaged(i64)

define void @A.constructor() {
entry:
  %mem.raw = call i8* @polly_mallocManaged(i64 16000)
  %mem.typed = bitcast i8* %mem.raw to i32*
  store i32* %mem.typed, i32** @A.toptr
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }

lib/CodeGen/ManagedMemoryRewrite.cpp
137 ↗	(On Diff #110543)	I believe this bitcast to be correct. Am I doing something "obviously" wrong here?
140 ↗	(On Diff #110543)	For some reason, this fails. I suspect this indicates some deeper bug in my code.
tools/GPURuntime/GPUJIT.c
36 ↗	(On Diff #110543)	This is just a debugging change, I'll remove it at the final commit.

[WIP] version now works on test code, going to run it on COSMO to detect bugs in implementation.

Harbormaster completed remote builds in B9222: Diff 110693.Aug 11 2017, 5:12 AM

Meinersbur edited edge metadata.Aug 12 2017, 3:58 PM

The function editAllUses look more complicated than necessary. Here is what I'd try:

For every function, insert a arrptr.load in the entry (or lazily when needed) (might be beneficial to have just one such load per function)

Iterate of the instructions that contain a use of GlobalArray.

Recurse over the operands of the instruction. Pseudo code:

Value *recursiveReplace(Value *Replacee, GlobalArray*Old, LoadInst *New) {
  if (Value == Old)
    return New;

  Instuction *Replacement = dyn_cast<Instruction>(Replacee); // Replacee is only an instruction for the recursion root.
  for (Use &Op : Value->operands()) {
    auto Replacer= recursiveReplace(Op>get());
    if (Replacer== Op->get())
     continue;
    if (!Replacement) {
     Replacement = Replacee->getAsInstruction();
     // Insert Replacement somewhere, e.g. right after arrptr.load (It was previously a constant, i.e. there are no other dependencies than to arrptr.load)
   }
    Replacement->setOperand(Op.getOperandNo(),  Replacer);
  }
  return Replacement;
}

Call

for (Instruction *UserOfArrayInst : ArrayUserInstructions) {
  auto New = recursiveReplace(UserOfArrayInst , Array, `arrptr.load`);
  if (New != UserOfArrayInst)
    UserOfArrayInst->replaceAllUsesWith(New);
}

lib/CodeGen/ManagedMemoryRewrite.cpp
100 ↗	(On Diff #110693)	[Suggestion] `SmallVector` instead of `std::vector`. Most instructions have only one or two operands, such that we don't need a memory location in the majority of cases.
120–122 ↗	(On Diff #110693)	[Style] Doxygen comment.
128 ↗	(On Diff #110693)	[Style] `GEP->getPointerOperand() != ArrToRewrite`
132 ↗	(On Diff #110693)	[Suggestion] `SmallVector` and `.reserve()`
143–145 ↗	(On Diff #110693)	[Suggestion] `std::set` is inefficient. Try `SmallPtrSet`?
151–152 ↗	(On Diff #110693)	Why does GEP need to be handled different than other instructions? There is also `GetElementPtrConstantExpr` which is not handled here.
159–165 ↗	(On Diff #110693)	An `assert` is enough.
222 ↗	(On Diff #110693)	[Style] if (auto *I = dyn_cast<Instruction>(Current))
230–235 ↗	(On Diff #110693)	AFAIK there is nothing else than Instruction and Constant deriving from User. An `assert(isa<Constant>(Current))` or `auto *C = cast<Constant>(Current)` would be enough.
239 ↗	(On Diff #110693)	Isn't it more replacing than removing? Also: functions start with lower-case letters
247 ↗	(On Diff #110693)	What does this return for multidimension arrays?
258 ↗	(On Diff #110693)	Uninitialized data would be ok as well, no? Could handle them the same as zero-initialized data, which is a value uninitialized data can have.
267 ↗	(On Diff #110693)	[Style] `cast<>` instead of `dyn_cast<>` gives you a type check in asset-builds.
279 ↗	(On Diff #110693)	Where does the magic number `100` come from?
291–292 ↗	(On Diff #110693)	Is the priority even important? AFAIK it can be just 0 for everything.
297 ↗	(On Diff #110693)	unfinished sentence
test/GPGPU/simple-managed-memory-rewrite.ll
7 ↗	(On Diff #110693)	`REQUIRES: pollyacc` missing

[Code dump] Dump of all changes that now allow global arrays to be rewritten

correctly.

We still need to rewrite allocas correctly.

[UNDEBUG] remove debug printing in GPUJIT
[NFC] check polly and fix test case to reduce size by x100

bollu marked 13 inline comments as done.Aug 15 2017, 2:00 AM

bollu added inline comments.

lib/CodeGen/ManagedMemoryRewrite.cpp
128 ↗	(On Diff #110693)	`rewriteGEP` has been removed.
132 ↗	(On Diff #110693)	`rewriteGEP` has been removed.
151–152 ↗	(On Diff #110693)	Yep, `GEP` special casing is removed.
247 ↗	(On Diff #110693)	It would return the `inner` array type. However, I don't believe this is a problem due to the way we allocate memory. We issue a single `cudaMallocManaged`, and then we convert the `[T]` to `T*`. This should still work, memory-layout wise unless I'm missing something.
258 ↗	(On Diff #110693)	Yes, it would. I set it to zero as a safe default. Do we gain / lose anything by switching to uninitialized?
279 ↗	(On Diff #110693)	Removed.

[WIP] Update according to Michael's comments.

[NFC] undo all changes to GPUJIT.

@Meinersbur - Second round of review, please :)

LGTM.

lib/CodeGen/ManagedMemoryRewrite.cpp
98 ↗	(On Diff #111198)	is populated WITH all the
123 ↗	(On Diff #111198)	No braces and no ";" Maybe also reduce casting.
127 ↗	(On Diff #111198)	You don't rewrite GEP, so remove the comment.
136 ↗	(On Diff #111198)	keeps
153 ↗	(On Diff #111198)	No braces for single statements
166 ↗	(On Diff #111198)	No braces for single-statements
218 ↗	(On Diff #111198)	Why is there a "false"?
310 ↗	(On Diff #111198)	Not needed any more.
lib/Support/RegisterPasses.cpp
48 ↗	(On Diff #111198)	Drop.
353 ↗	(On Diff #111198)	Unrelated change?

This revision is now accepted and ready to land.Aug 16 2017, 2:40 AM

The idea seems sensible. Some nits inline. Most importantly the intent and algorithm of the pass should be documented much more explicitely.

lib/CodeGen/ManagedMemoryRewrite.cpp
37 ↗	(On Diff #111198)	Unused include?
58 ↗	(On Diff #111198)	Function names should be humbleCamelCase
100 ↗	(On Diff #111198)	This algorithm requires a much more extensive documentation.
106 ↗	(On Diff #111198)	This is unused?
134 ↗	(On Diff #111198)	InstsToBeDeleted is unused?
190 ↗	(On Diff #111198)	Use SmalVvector instead of std::vector
206 ↗	(On Diff #111198)	What's the purpose of this?
212 ↗	(On Diff #111198)	`ElemTy->getPointerTo(AddrSpace)`
237 ↗	(On Diff #111198)	No need to pass the empty array here.
244 ↗	(On Diff #111198)	Isn't this off by a factor of 8?
257 ↗	(On Diff #111198)	Use a SmallVector instead
263 ↗	(On Diff #111198)	Superfluous braces.
268 ↗	(On Diff #111198)	`InstsToBeDeleted` is never populated with anything.
284 ↗	(On Diff #111198)	static
303 ↗	(On Diff #111198)	static
316 ↗	(On Diff #111198)	Just `Builder.getInt64(Size)`
324 ↗	(On Diff #111198)	Is this actually ever relevant?
341 ↗	(On Diff #111198)	Why is this a member?
369 ↗	(On Diff #111198)	Here and below: Superfluous braces
383 ↗	(On Diff #111198)	Use a SmallSet instead.

[NFC] Rewrite based on Phillip & Tobias' comments

[Bugfix] Send bytes, not bits. Also, move the alloca function extraction to the correct place.

Harbormaster completed remote builds in B9353: Diff 111475.Aug 17 2017, 1:28 AM

bollu added inline comments.Aug 17 2017, 1:28 AM

lib/CodeGen/ManagedMemoryRewrite.cpp
206 ↗	(On Diff #111198)	The `AddrSpace` is a parameter to all pointers. I believe the default address space is zero. As for why LLVM uses it, for example, the NVPTX backend uses address spaces to refer to where the pointer lives in memory.
218 ↗	(On Diff #111198)	I need some help with this one, actually. What is the correct linkage type?
324 ↗	(On Diff #111198)	Probably not, but I'd much rather be defensive about things like this.

@philip.pfaffe Another round of review, please?

[Re-upload] diff against the newest HEAD.

Harbormaster completed remote builds in B9354: Diff 111476.Aug 17 2017, 1:32 AM

[Merge] Merged with master, hoping that GPUJIT does not show up from arc diff this time.

Harbormaster completed remote builds in B9355: Diff 111478.Aug 17 2017, 1:36 AM

philip.pfaffe added inline comments.Aug 17 2017, 1:58 AM

lib/CodeGen/ManagedMemoryRewrite.cpp
122 ↗	(On Diff #111478)	Should you not assert before using I above?
149 ↗	(On Diff #111478)	Does this actually ever happen? You're recursively expanding ConstExpr operands, so after a single pass that shouldn't be necessary anymore. Does this mean that this loop's trip count is actually never greater than two?
219 ↗	(On Diff #111478)	Internal or Common linkage should be used.
206 ↗	(On Diff #111198)	The question was not what an `AddrSpace` is, but why you're actively passing the default everywhere, and even store this in an extra variable.

[NFC] Discuss algorithm with phillip offline, he helped to simplify it further.
[Linkage] Update linkage code to use the correct linker options as well as the ignore linkage flag. Update test case to match this change
[ReplaceUsesOfWith] remove double-loop that was not required.

Harbormaster completed remote builds in B9357: Diff 111484.Aug 17 2017, 2:34 AM

Besides the AddrSpace, LGTM!

[NFC] remove 0 address space because that is the default value.

Harbormaster completed remote builds in B9360: Diff 111488.Aug 17 2017, 3:41 AM

Closed by commit rL311080: [ManagedMemoryRewrite] Learn how to rewrite global arrays, allocas. (authored by bollu). · Explain WhyAug 17 2017, 4:23 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

polly/

trunk/

lib/

CodeGen/

ManagedMemoryRewrite.cpp

279 lines

test/

GPGPU/

managed-memory-rewrite-alloca.ll

61 lines

simple-managed-memory-rewrite.ll

73 lines

Diff 111495

polly/trunk/lib/CodeGen/ManagedMemoryRewrite.cpp

	//===------ ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory.			//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===//
	//---===//
	//			//
	// The LLVM Compiler Infrastructure			// The LLVM Compiler Infrastructure
	//			//
	// This file is distributed under the University of Illinois Open Source			// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.			// License. See LICENSE.TXT for details.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
	Show All 14 Lines
	#include "polly/DependenceInfo.h"			#include "polly/DependenceInfo.h"
	#include "polly/LinkAllPasses.h"			#include "polly/LinkAllPasses.h"
	#include "polly/Options.h"			#include "polly/Options.h"
	#include "polly/ScopDetection.h"			#include "polly/ScopDetection.h"
	#include "polly/ScopInfo.h"			#include "polly/ScopInfo.h"
	#include "polly/Support/SCEVValidator.h"			#include "polly/Support/SCEVValidator.h"
	#include "llvm/Analysis/AliasAnalysis.h"			#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/BasicAliasAnalysis.h"			#include "llvm/Analysis/BasicAliasAnalysis.h"
				#include "llvm/Analysis/CaptureTracking.h"
	#include "llvm/Analysis/GlobalsModRef.h"			#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"			#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"			#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/TargetTransformInfo.h"			#include "llvm/Analysis/TargetTransformInfo.h"
	#include "llvm/IR/LegacyPassManager.h"			#include "llvm/IR/LegacyPassManager.h"
	#include "llvm/IR/Verifier.h"			#include "llvm/IR/Verifier.h"
	#include "llvm/IRReader/IRReader.h"			#include "llvm/IRReader/IRReader.h"
	#include "llvm/Linker/Linker.h"			#include "llvm/Linker/Linker.h"
	#include "llvm/Support/TargetRegistry.h"			#include "llvm/Support/TargetRegistry.h"
	#include "llvm/Support/TargetSelect.h"			#include "llvm/Support/TargetSelect.h"
	#include "llvm/Target/TargetMachine.h"			#include "llvm/Target/TargetMachine.h"
	#include "llvm/Transforms/IPO/PassManagerBuilder.h"			#include "llvm/Transforms/IPO/PassManagerBuilder.h"
	#include "llvm/Transforms/Utils/BasicBlockUtils.h"			#include "llvm/Transforms/Utils/BasicBlockUtils.h"
				#include "llvm/Transforms/Utils/ModuleUtils.h"

				static cl::opt<bool> RewriteAllocas(
				"polly-acc-rewrite-allocas",
				cl::desc(
				"Ask the managed memory rewriter to also rewrite alloca instructions"),
				cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

				static cl::opt<bool> IgnoreLinkageForGlobals(
				"polly-acc-rewrite-ignore-linkage-for-globals",
				cl::desc(
				"By default, we only rewrite globals with internal linkage. This flag "
				"enables rewriting of globals regardless of linkage"),
				cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

				#define DEBUG_TYPE "polly-acc-rewrite-managed-memory"
	namespace {			namespace {

	static llvm::Function *GetOrCreatePollyMallocManaged(Module &M) {			static llvm::Function *getOrCreatePollyMallocManaged(Module &M) {
	// TODO: should I allow this pass to be a standalone pass that
	// doesn't care if PollyManagedMemory is enabled or not?
	assert(PollyManagedMemory &&
	"One should only rewrite malloc & free to"
	"polly_{malloc,free}Managed with managed memory enabled.");
	const char *Name = "polly_mallocManaged";			const char *Name = "polly_mallocManaged";
	Function *F = M.getFunction(Name);			Function *F = M.getFunction(Name);

	// If F is not available, declare it.			// If F is not available, declare it.
	if (!F) {			if (!F) {
	GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;			GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
	PollyIRBuilder Builder(M.getContext());			PollyIRBuilder Builder(M.getContext());
	// TODO: How do I get `size_t`? I assume from DataLayout?			// TODO: How do I get `size_t`? I assume from DataLayout?
	FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(),			FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(),
	{Builder.getInt64Ty()}, false);			{Builder.getInt64Ty()}, false);
	F = Function::Create(Ty, Linkage, Name, &M);			F = Function::Create(Ty, Linkage, Name, &M);
	}			}

	return F;			return F;
	}			}

	static llvm::Function *GetOrCreatePollyFreeManaged(Module &M) {			static llvm::Function *getOrCreatePollyFreeManaged(Module &M) {
	// TODO: should I allow this pass to be a standalone pass that
	// doesn't care if PollyManagedMemory is enabled or not?
	assert(PollyManagedMemory &&
	"One should only rewrite malloc & free to"
	"polly_{malloc,free}Managed with managed memory enabled.");
	const char *Name = "polly_freeManaged";			const char *Name = "polly_freeManaged";
	Function *F = M.getFunction(Name);			Function *F = M.getFunction(Name);

	// If F is not available, declare it.			// If F is not available, declare it.
	if (!F) {			if (!F) {
	GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;			GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
	PollyIRBuilder Builder(M.getContext());			PollyIRBuilder Builder(M.getContext());
	// TODO: How do I get `size_t`? I assume from DataLayout?			// TODO: How do I get `size_t`? I assume from DataLayout?
	FunctionType *Ty =			FunctionType *Ty =
	FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false);			FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false);
	F = Function::Create(Ty, Linkage, Name, &M);			F = Function::Create(Ty, Linkage, Name, &M);
	}			}

	return F;			return F;
	}			}

				// Expand a constant expression `Cur`, which is used at instruction `Parent`
				// at index `index`.
				// Since a constant expression can expand to multiple instructions, store all
				// the expands into a set called `Expands`.
				// Note that this goes inorder on the constant expression tree.
				// A * ((B * D) + C)
				// will be processed with first A, then B * D, then B, then D, and then C.
				// Though ConstantExprs are not treated as "trees" but as DAGs, since you can
				// have something like this:
				// *
				// / \
				// \ /
				// (D)
				//
				// For the purposes of this expansion, we expand the two occurences of D
				// separately. Therefore, we expand the DAG into the tree:
				// *
				// / \
				// D D
				// TODO: We don't _have_to do this, but this is the simplest solution.
				// We can write a solution that keeps track of which constants have been
				// already expanded.
				static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder,
				Instruction *Parent, int index,
				SmallPtrSet<Instruction *, 4> &Expands) {
				assert(Cur && "invalid constant expression passed");

				Instruction *I = Cur->getAsInstruction();
				Expands.insert(I);
				Parent->setOperand(index, I);

				assert(I && "unable to convert ConstantExpr to Instruction");
				// The things that `Parent` uses (its operands) should be created
				// before `Parent`.
				Builder.SetInsertPoint(Parent);
				Builder.Insert(I);

				DEBUG(dbgs() << "Expanding ConstantExpression: " << *Cur
				<< " \| in Instruction: " << *I << "\n";);
				for (unsigned i = 0; i < Cur->getNumOperands(); i++) {
				Value *Op = Cur->getOperand(i);
				assert(isa<Constant>(Op) && "constant must have a constant operand");

				if (ConstantExpr *CExprOp = dyn_cast<ConstantExpr>(Op))
				expandConstantExpr(CExprOp, Builder, I, i, Expands);
				}
				}

				// Edit all uses of `OldVal` to NewVal` in `Inst`. This will rewrite
				// `ConstantExpr`s that are used in the `Inst`.
				// Note that `replaceAllUsesWith` is insufficient for this purpose because it
				// does not rewrite values in `ConstantExpr`s.
				static void rewriteOldValToNew(Instruction Inst, Value OldVal, Value *NewVal,
				PollyIRBuilder &Builder) {

				// This contains a set of instructions in which OldVal must be replaced.
				// We start with `Inst`, and we fill it up with the expanded `ConstantExpr`s
				// from `Inst`s arguments.
				// We need to go through this process because `replaceAllUsesWith` does not
				// actually edit `ConstantExpr`s.
				SmallPtrSet<Instruction *, 4> InstsToVisit = {Inst};

				// Expand all `ConstantExpr`s and place it in `InstsToVisit`.
				for (unsigned i = 0; i < Inst->getNumOperands(); i++) {
				Value *Operand = Inst->getOperand(i);
				if (ConstantExpr *ValueConstExpr = dyn_cast<ConstantExpr>(Operand))
				expandConstantExpr(ValueConstExpr, Builder, Inst, i, InstsToVisit);
				}

				// Now visit each instruction and use `replaceUsesOfWith`. We know that
				// will work because `I` cannot have any `ConstantExpr` within it.
				for (Instruction *I : InstsToVisit)
				I->replaceUsesOfWith(OldVal, NewVal);
				}

				// Given a value `Current`, return all Instructions that may contain `Current`
				// in an expression.
				// We need this auxiliary function, because if we have a
				// `Constant` that is a user of `V`, we need to recurse into the
				// `Constant`s uses to gather the root instruciton.
				static void getInstructionUsersOfValue(Value *V,
				SmallVector<Instruction *, 4> &Owners) {
				if (auto *I = dyn_cast<Instruction>(V)) {
				Owners.push_back(I);
				} else {
				// Anything that is a `User` must be a constant or an instruction.
				auto *C = cast<Constant>(V);
				for (Use &CUse : C->uses())
				getInstructionUsersOfValue(CUse.getUser(), Owners);
				}
				}

				static void
				replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array,
				SmallPtrSet<GlobalVariable *, 4> &ReplacedGlobals) {
				// We only want arrays.
				ArrayType *ArrayTy = dyn_cast<ArrayType>(Array.getType()->getElementType());
				if (!ArrayTy)
				return;
				Type *ElemTy = ArrayTy->getElementType();
				PointerType *ElemPtrTy = ElemTy->getPointerTo();

				// We only wish to replace arrays that are visible in the module they
				// inhabit. Otherwise, our type edit from [T] to T* would be illegal across
				// modules.
				const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() \|\|
				Array.hasInternalLinkage() \|\|
				IgnoreLinkageForGlobals;
				if (!OnlyVisibleInsideModule)
				return;

				if (!Array.hasInitializer() \|\|
				!isa<ConstantAggregateZero>(Array.getInitializer()))
				return;

				// At this point, we have committed to replacing this array.
				ReplacedGlobals.insert(&Array);

				std::string NewName = (Array.getName() + Twine(".toptr")).str();
				GlobalVariable *ReplacementToArr =
				cast<GlobalVariable>(M.getOrInsertGlobal(NewName, ElemPtrTy));
				ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy));

				Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
				Twine FnName = Array.getName() + ".constructor";
				PollyIRBuilder Builder(M.getContext());
				FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
				const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
				Function *F = Function::Create(Ty, Linkage, FnName, &M);
				BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F);
				Builder.SetInsertPoint(Start);

				int ArraySizeInt = DL.getTypeAllocSizeInBits(ArrayTy) / 8;
				Value *ArraySize = Builder.getInt64(ArraySizeInt);
				ArraySize->setName("array.size");

				Value *AllocatedMemRaw =
				Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw");
				Value *AllocatedMemTyped =
				Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed");
				Builder.CreateStore(AllocatedMemTyped, ReplacementToArr);
				Builder.CreateRetVoid();

				const int Priority = 0;
				appendToGlobalCtors(M, F, Priority, ReplacementToArr);

				SmallVector<Instruction *, 4> ArrayUserInstructions;
				// Get all instructions that use array. We need to do this weird thing
				// because `Constant`s that contain this array neeed to be expanded into
				// instructions so that we can replace their parameters. `Constant`s cannot
				// be edited easily, so we choose to convert all `Constant`s to
				// `Instruction`s and handle all of the uses of `Array` uniformly.
				for (Use &ArrayUse : Array.uses())
				getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions);

				for (Instruction *UserOfArrayInst : ArrayUserInstructions) {

				Builder.SetInsertPoint(UserOfArrayInst);
				// <ty>** -> <ty>*
				Value *ArrPtrLoaded = Builder.CreateLoad(ReplacementToArr, "arrptr.load");
				// <ty>* -> [ty]*
				Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast(
				ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast");
				rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder);
				}
				}

				// We return all `allocas` that may need to be converted to a call to
				// cudaMallocManaged.
				static void getAllocasToBeManaged(Function &F,
				SmallSet<AllocaInst *, 4> &Allocas) {
				for (BasicBlock &BB : F) {
				for (Instruction &I : BB) {
				auto *Alloca = dyn_cast<AllocaInst>(&I);
				if (!Alloca)
				continue;
				dbgs() << "Checking if " << *Alloca << "may be captured: ";

				if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false,
				/* StoreCaptures */ true)) {
				Allocas.insert(Alloca);
				DEBUG(dbgs() << "YES (captured)\n");
				} else {
				DEBUG(dbgs() << "NO (not captured)\n");
				}
				}
				}
				}

				static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca,
				const DataLayout &DL) {
				DEBUG(dbgs() << "rewriting: " << *Alloca << " to managed mem.\n");
				Module *M = Alloca->getModule();
				assert(M && "Alloca does not have a module");

				PollyIRBuilder Builder(M->getContext());
				Builder.SetInsertPoint(Alloca);

				Value MallocManagedFn = getOrCreatePollyMallocManaged(Alloca->getModule());
				const int Size = DL.getTypeAllocSize(Alloca->getType()->getElementType());
				Value *SizeVal = Builder.getInt64(Size);
				Value *RawManagedMem = Builder.CreateCall(MallocManagedFn, {SizeVal});
				Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType());

				Function *F = Alloca->getFunction();
				assert(F && "Alloca has invalid function");

				Bitcasted->takeName(Alloca);
				Alloca->replaceAllUsesWith(Bitcasted);
				Alloca->eraseFromParent();

				for (BasicBlock &BB : *F) {
				ReturnInst *Return = dyn_cast<ReturnInst>(BB.getTerminator());
				if (!Return)
				continue;
				Builder.SetInsertPoint(Return);

				Value FreeManagedFn = getOrCreatePollyFreeManaged(M);
				Builder.CreateCall(FreeManagedFn, {RawManagedMem});
				}
				}

	class ManagedMemoryRewritePass : public ModulePass {			class ManagedMemoryRewritePass : public ModulePass {
	public:			public:
	static char ID;			static char ID;
	GPUArch Architecture;			GPUArch Architecture;
	GPURuntime Runtime;			GPURuntime Runtime;

	ManagedMemoryRewritePass() : ModulePass(ID) {}			ManagedMemoryRewritePass() : ModulePass(ID) {}
	virtual bool runOnModule(Module &M) {			virtual bool runOnModule(Module &M) {
				const DataLayout &DL = M.getDataLayout();

	Function *Malloc = M.getFunction("malloc");			Function *Malloc = M.getFunction("malloc");

	if (Malloc) {			if (Malloc) {
	Function *PollyMallocManaged = GetOrCreatePollyMallocManaged(M);			Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
	assert(PollyMallocManaged && "unable to create polly_mallocManaged");			assert(PollyMallocManaged && "unable to create polly_mallocManaged");
	Malloc->replaceAllUsesWith(PollyMallocManaged);			Malloc->replaceAllUsesWith(PollyMallocManaged);
	Malloc->eraseFromParent();			Malloc->eraseFromParent();
	}			}

	Function *Free = M.getFunction("free");			Function *Free = M.getFunction("free");

	if (Free) {			if (Free) {
	Function *PollyFreeManaged = GetOrCreatePollyFreeManaged(M);			Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M);
	assert(PollyFreeManaged && "unable to create polly_freeManaged");			assert(PollyFreeManaged && "unable to create polly_freeManaged");
	Free->replaceAllUsesWith(PollyFreeManaged);			Free->replaceAllUsesWith(PollyFreeManaged);
	Free->eraseFromParent();			Free->eraseFromParent();
	}			}

				SmallPtrSet<GlobalVariable *, 4> GlobalsToErase;
				for (GlobalVariable &Global : M.globals())
				replaceGlobalArray(M, DL, Global, GlobalsToErase);
				for (GlobalVariable *G : GlobalsToErase)
				G->eraseFromParent();

				// Rewrite allocas to cudaMallocs if we are asked to do so.
				if (RewriteAllocas) {
				SmallSet<AllocaInst *, 4> AllocasToBeManaged;
				for (Function &F : M.functions())
				getAllocasToBeManaged(F, AllocasToBeManaged);

				for (AllocaInst *Alloca : AllocasToBeManaged)
				rewriteAllocaAsManagedMemory(Alloca, DL);
				}

	return true;			return true;
	}			}
	};			};

	} // namespace			} // namespace
	char ManagedMemoryRewritePass::ID = 42;			char ManagedMemoryRewritePass::ID = 42;

	Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch,			Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch,
	Show All 22 Lines

polly/trunk/test/GPGPU/managed-memory-rewrite-alloca.ll

				; RUN: opt %loadPolly -analyze -polly-process-unprofitable \
				; RUN: -polly-scops -polly-use-llvm-names < %s \| FileCheck %s --check-prefix=SCOP

				; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \
				; RUN: -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory \
				; RUN: -polly-acc-rewrite-managed-memory -polly-acc-rewrite-allocas < %s \| FileCheck %s --check-prefix=HOST-IR

				; REQUIRES: pollyacc

				; SCOP: Function: f
				; SCOP-NEXT: Region: %for.body---%for.end
				; SCOP-NEXT: Max Loop Depth: 1
				; SCOP: i32 MemRef_arr[*];

				; Check that we generate a constructor call for @A.toptr
				; HOST-IR-NOT: %arr = alloca [100 x i32]

				source_filename = "test.c"
				target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-apple-macosx10.12.0"


				define void @f() {
				entry:
				%arr = alloca [100 x i32]
				br label %entry.split

				entry.split: ; preds = %entry
				br label %for.body

				for.body: ; preds = %entry.split, %for.body
				%indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
				%arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %arr, i64 0, i64 %indvars.iv1
				store i32 42, i32* %arrayidx, align 4, !tbaa !3
				%indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
				%exitcond = icmp eq i64 %indvars.iv.next, 100
				br i1 %exitcond, label %for.end, label %for.body

				for.end: ; preds = %for.body
				ret void
				}

				; Function Attrs: argmemonly nounwind
				declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0


				; Function Attrs: argmemonly nounwind
				declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0

				attributes #0 = { argmemonly nounwind }

				!llvm.module.flags = !{!0, !1}
				!llvm.ident = !{!2}

				!0 = !{i32 1, !"wchar_size", i32 4}
				!1 = !{i32 7, !"PIC Level", i32 2}
				!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"}
				!3 = !{!4, !4, i64 0}
				!4 = !{!"int", !5, i64 0}
				!5 = !{!"omnipotent char", !6, i64 0}
				!6 = !{!"Simple C/C++ TBAA"}

polly/trunk/test/GPGPU/simple-managed-memory-rewrite.ll

				; RUN: opt %loadPolly -analyze -polly-process-unprofitable \
				; RUN: -polly-scops -polly-use-llvm-names < %s \| FileCheck %s --check-prefix=SCOP

				; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 \
				; RUN: -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory \
				; RUN: -polly-acc-rewrite-managed-memory < %s \| FileCheck %s --check-prefix=HOST-IR

				; REQUIRES: pollyacc

				; SCOP: Function: f
				; SCOP-NEXT: Region: %for.body---%for.end
				; SCOP-NEXT: Max Loop Depth: 1
				; SCOP: i32 MemRef_A[*];

				; Check that we generate a constructor call for @A.toptr
				; HOST-IR: @llvm.global_ctors = appending global [1 x { i32, void (), i8 }] [{ i32, void (), i8 } { i32 0, void ()* @.constructor, i8* bitcast (i32** @A.toptr to i8*) }]

				; Check that we generate a constructor
				; 4 bytes * 100 = 400
				; HOST-IR: define void @.constructor() {
				; HOST-IR-NEXT: entry:
				; HOST-IR-NEXT: %mem.raw = call i8* @polly_mallocManaged(i64 400)
				; HOST-IR-NEXT: %mem.typed = bitcast i8* %mem.raw to i32*
				; HOST-IR-NEXT: store i32* %mem.typed, i32** @A.toptr
				; HOST-IR-NEXT: ret void
				; HOST-IR-NEXT: }

				; HOST-IR-NOT: @A

				source_filename = "test.c"
				target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-apple-macosx10.12.0"

				@A = internal global [100 x i32] zeroinitializer, align 16

				define void @f() {
				entry:
				br label %entry.split

				entry.split: ; preds = %entry
				br label %for.body

				for.body: ; preds = %entry.split, %for.body
				%indvars.iv1 = phi i64 [ 0, %entry.split ], [ %indvars.iv.next, %for.body ]
				%arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @A, i64 0, i64 %indvars.iv1
				store i32 42, i32* %arrayidx, align 4, !tbaa !3
				%indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
				%exitcond = icmp eq i64 %indvars.iv.next, 100
				br i1 %exitcond, label %for.end, label %for.body

				for.end: ; preds = %for.body
				ret void
				}

				; Function Attrs: argmemonly nounwind
				declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0


				; Function Attrs: argmemonly nounwind
				declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0

				attributes #0 = { argmemonly nounwind }

				!llvm.module.flags = !{!0, !1}
				!llvm.ident = !{!2}

				!0 = !{i32 1, !"wchar_size", i32 4}
				!1 = !{i32 7, !"PIC Level", i32 2}
				!2 = !{!"clang version 6.0.0 (http://llvm.org/git/clang.git 6660f0d30ef23b3142a6b08f9f41aad3d47c084f) (http://llvm.org/git/llvm.git 052dd78cb30f77a05dc8bb06b851402c4b6c6587)"}
				!3 = !{!4, !4, i64 0}
				!4 = !{!"int", !5, i64 0}
				!5 = !{!"omnipotent char", !6, i64 0}
				!6 = !{!"Simple C/C++ TBAA"}

This is an archive of the discontinued LLVM Phabricator instance.

[WIP] [Polly] [ManagedMemoryRewrite] Rewrite global arrays with global pointers that are polly_mallocManage'dClosedPublic

Details

Diff Detail

Event Timeline