Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
+++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp
@@ -441,13 +441,10 @@
   /// @returns A tuple with grid sizes for X and Y dimension
   std::tuple<Value *, Value *> getGridSizes(ppcg_kernel *Kernel);
 
-  /// Creates a array that can be sent to the kernel on the device using a
-  /// host pointer. This is required for managed memory, when we directly send
-  /// host pointers to the device.
+  /// Get the managed array pointer for sending host pointers to the device.
   /// \note
   /// This is to be used only with managed memory
-  Value *getOrCreateManagedDeviceArray(gpu_array_info *Array,
-                                       ScopArrayInfo *ArrayInfo);
+  Value *getManagedDeviceArray(gpu_array_info *Array, ScopArrayInfo *ArrayInfo);
 
   /// Compute the sizes of the thread blocks for a given kernel.
   ///
@@ -652,6 +649,9 @@
   /// Create code that allocates memory to store arrays on device.
   void allocateDeviceArrays();
 
+  /// Create code to prepare the managed device pointers.
+  void prepareManagedDeviceArrays();
+
   /// Free all allocated device arrays.
   void freeDeviceArrays();
 
@@ -747,6 +747,8 @@
 
   if (!ManagedMemory)
     allocateDeviceArrays();
+  else
+    prepareManagedDeviceArrays();
 }
 
 void GPUNodeBuilder::finalize() {
@@ -783,6 +785,32 @@
   isl_ast_build_free(Build);
 }
 
+void GPUNodeBuilder::prepareManagedDeviceArrays() {
+  assert(ManagedMemory &&
+         "Device arrays must only be prepared in managed-memory mode");
+  for (int i = 0; i < Prog->n_array; ++i) {
+    gpu_array_info *Array = &Prog->array[i];
+    ScopArrayInfo *ScopArray = (ScopArrayInfo *)Array->user;
+    Value *HostPtr;
+
+    if (gpu_array_is_scalar(Array))
+      HostPtr = BlockGen.getOrCreateAlloca(ScopArray);
+    else
+      HostPtr = ScopArray->getBasePtr();
+    HostPtr = getLatestValue(HostPtr);
+
+    Value *Offset = getArrayOffset(Array);
+    if (Offset) {
+      HostPtr = Builder.CreatePointerCast(
+          HostPtr, ScopArray->getElementType()->getPointerTo());
+      HostPtr = Builder.CreateGEP(HostPtr, Offset);
+    }
+
+    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
+    DeviceAllocations[ScopArray] = HostPtr;
+  }
+}
+
 void GPUNodeBuilder::addCUDAAnnotations(Module *M, Value *BlockDimX,
                                         Value *BlockDimY, Value *BlockDimZ) {
   auto AnnotationNode = M->getOrInsertNamedMetadata("nvvm.annotations");
@@ -1102,35 +1130,16 @@
   return ExprBuilder.create(Result.release());
 }
 
-Value *GPUNodeBuilder::getOrCreateManagedDeviceArray(gpu_array_info *Array,
-                                                     ScopArrayInfo *ArrayInfo) {
-
+Value *GPUNodeBuilder::getManagedDeviceArray(gpu_array_info *Array,
+                                             ScopArrayInfo *ArrayInfo) {
   assert(ManagedMemory && "Only used when you wish to get a host "
                           "pointer for sending data to the kernel, "
                           "with managed memory");
   std::map<ScopArrayInfo *, Value *>::iterator it;
-  if ((it = DeviceAllocations.find(ArrayInfo)) != DeviceAllocations.end()) {
-    return it->second;
-  } else {
-    Value *HostPtr;
-
-    if (gpu_array_is_scalar(Array))
-      HostPtr = BlockGen.getOrCreateAlloca(ArrayInfo);
-    else
-      HostPtr = ArrayInfo->getBasePtr();
-    HostPtr = getLatestValue(HostPtr);
-
-    Value *Offset = getArrayOffset(Array);
-    if (Offset) {
-      HostPtr = Builder.CreatePointerCast(
-          HostPtr, ArrayInfo->getElementType()->getPointerTo());
-      HostPtr = Builder.CreateGEP(HostPtr, Offset);
-    }
-
-    HostPtr = Builder.CreatePointerCast(HostPtr, Builder.getInt8PtrTy());
-    DeviceAllocations[ArrayInfo] = HostPtr;
-    return HostPtr;
-  }
+  it = DeviceAllocations.find(ArrayInfo);
+  assert(it != DeviceAllocations.end() &&
available"); + return it->second; } void GPUNodeBuilder::createDataTransfer(__isl_take isl_ast_node *TransferStmt, @@ -1577,8 +1586,8 @@ Value *DevArray = nullptr; if (ManagedMemory) { - DevArray = getOrCreateManagedDeviceArray( - &Prog->array[i], const_cast(SAI)); + DevArray = getManagedDeviceArray(&Prog->array[i], + const_cast(SAI)); } else { DevArray = DeviceAllocations[const_cast(SAI)]; DevArray = createCallGetDevicePtr(DevArray); Index: polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll =================================================================== --- polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll +++ polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll @@ -37,13 +37,13 @@ ; CHECK: %13 = call i8* @polly_initContextCUDA() ; CHECK-NEXT: %14 = bitcast i32* %A to i8* -; CHECK-NEXT: %15 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; CHECK-NEXT: %15 = bitcast i32* %R to i8* +; CHECK-NEXT: %16 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0 -; CHECK-NEXT: %16 = bitcast i8** %polly_launch_0_param_0 to i8* -; CHECK-NEXT: store i8* %16, i8** %15 -; CHECK-NEXT: %17 = bitcast i32* %R to i8* +; CHECK-NEXT: %17 = bitcast i8** %polly_launch_0_param_0 to i8* +; CHECK-NEXT: store i8* %17, i8** %16 ; CHECK-NEXT: %18 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 -; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1 +; CHECK-NEXT: store i8* %15, i8** %polly_launch_0_param_1 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8* ; CHECK-NEXT: store i8* %19, i8** %18 ; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_0 Index: polly/trunk/test/GPGPU/managed-pointers-preparation.ll =================================================================== --- polly/trunk/test/GPGPU/managed-pointers-preparation.ll +++ polly/trunk/test/GPGPU/managed-pointers-preparation.ll @@ -0,0 +1,105 @@ +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: -S -polly-acc-codegen-managed-memory < %s | FileCheck %s + +; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ +; RUN: -polly-invariant-load-hoisting \ +; RUN: -S -polly-acc-codegen-managed-memory -disable-output \ +; RUN: -polly-acc-dump-code < %s | FileCheck %s -check-prefix=CODE + +; CHECK: @polly_launchKernel +; CHECK: @polly_launchKernel +; CHECK: @polly_launchKernel +; CHECK: @polly_launchKernel +; CHECK: @polly_launchKernel +; CHECK-NOT: @polly_launchKernel + + +; CODE: if (p_0_loaded_from___data_runcontrol_MOD_lmulti_layer == 0) { +; CODE-NEXT: { +; CODE-NEXT: dim3 k0_dimBlock; +; CODE-NEXT: dim3 k0_dimGrid; +; CODE-NEXT: kernel0 <<>> (dev_MemRef__pn__phi, p_0_loaded_from___data_runcontrol_MOD_lmulti_layer); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CODE: } else { +; CODE-NEXT: { +; CODE-NEXT: dim3 k1_dimBlock; +; CODE-NEXT: dim3 k1_dimGrid; +; CODE-NEXT: kernel1 <<>> (dev_MemRef__pn__phi, p_0_loaded_from___data_runcontrol_MOD_lmulti_layer); +; CODE-NEXT: cudaCheckKernel(); +; CODE-NEXT: } + +; CHECK that this program is correctly code generated and does not result in +; 'instruction does not dominate use' errors. At an earlier point, such errors +; have been generated as the preparation of the managed memory pointers was +; performed right before kernel0, which does not dominate all other kernels. +; Now the preparation is performed at the very beginning of the scop. 
+ +source_filename = "bugpoint-output-c78f41e.bc" +target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" + +@__data_radiation_MOD_rad_csalbw = external global [10 x double], align 32 +@__data_radiation_MOD_coai = external global [168 x double], align 32 +@__data_runcontrol_MOD_lmulti_layer = external global i32 + +; Function Attrs: nounwind uwtable +define void @__radiation_interface_MOD_radiation_init() #0 { +entry: + br label %"94" + +"94": ; preds = %"97", %entry + br label %"95" + +"95": ; preds = %"95", %"94" + br i1 undef, label %"97", label %"95" + +"97": ; preds = %"95" + br i1 undef, label %"99", label %"94" + +"99": ; preds = %"97" + br label %"102" + +"102": ; preds = %"102", %"99" + %indvars.iv17 = phi i64 [ %indvars.iv.next18, %"102" ], [ 1, %"99" ] + %0 = getelementptr [168 x double], [168 x double]* @__data_radiation_MOD_coai, i64 0, i64 0 + store double 1.000000e+00, double* %0, align 8 + %1 = icmp eq i64 %indvars.iv17, 3 + %indvars.iv.next18 = add nuw nsw i64 %indvars.iv17, 1 + br i1 %1, label %"110", label %"102" + +"110": ; preds = %"102" + %2 = load i32, i32* @__data_runcontrol_MOD_lmulti_layer, align 4, !range !0 + %3 = icmp eq i32 %2, 0 + br i1 %3, label %"112", label %"111" + +"111": ; preds = %"110" + br label %"115" + +"112": ; preds = %"110" + br label %"115" + +"115": ; preds = %"112", %"111" + %.pn = phi double [ undef, %"112" ], [ undef, %"111" ] + %4 = fdiv double 1.000000e+00, %.pn + br label %"116" + +"116": ; preds = %"116", %"115" + %indvars.iv = phi i64 [ %indvars.iv.next, %"116" ], [ 1, %"115" ] + %5 = add nsw i64 %indvars.iv, -1 + %6 = fmul double %4, undef + %7 = getelementptr [10 x double], [10 x double]* @__data_radiation_MOD_rad_csalbw, i64 0, i64 %5 + store double %6, double* %7, align 8 + %8 = icmp eq i64 %indvars.iv, 10 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + br i1 %8, label %return, label %"116" + +return: ; preds = %"116" + ret void +} + +attributes #0 = { nounwind uwtable } + +!0 = !{i32 0, i32 2}
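For readers skimming the patch, the following is a minimal C++ sketch (with simplified, hypothetical stand-in types; it is not code from the patch) of the pattern the change introduces: managed host pointers are recorded once in DeviceAllocations when the scop is entered, and every later kernel launch only performs a map lookup, so the prepared pointer values dominate all launch sites.

// Sketch only: ScopArrayInfo and Value stand in for the Polly/LLVM types.
#include <cassert>
#include <map>

struct ScopArrayInfo; // opaque stand-in
struct Value;         // opaque stand-in

struct ManagedPointerTable {
  std::map<ScopArrayInfo *, Value *> DeviceAllocations;

  // Runs once, right after the run-time check at the beginning of the scop.
  void prepare(ScopArrayInfo *SAI, Value *HostPtr) {
    DeviceAllocations[SAI] = HostPtr;
  }

  // Runs per kernel launch; it only looks up and never emits preparation code.
  Value *lookup(ScopArrayInfo *SAI) {
    auto It = DeviceAllocations.find(SAI);
    assert(It != DeviceAllocations.end() &&
           "Device array expected to be available");
    return It->second;
  }
};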