Index: lib/CodeGen/PPCGCodeGeneration.cpp
===================================================================
--- lib/CodeGen/PPCGCodeGeneration.cpp
+++ lib/CodeGen/PPCGCodeGeneration.cpp
@@ -142,6 +142,14 @@
   return RefToExpr;
 }
 
+/// Given an LLVM Type, compute its size in bytes.
+static int computeSizeInBytes(const Type *T) {
+  int bytes = T->getPrimitiveSizeInBits() / 8;
+  if (bytes == 0)
+    bytes = T->getScalarSizeInBits() / 8;
+  return bytes;
+}
+
 /// Generate code for a GPU specific isl AST.
 ///
 /// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
@@ -272,6 +280,16 @@
   /// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
   std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);
 
+  /// Store a specific kernel launch parameter in the array of kernel launch
+  /// parameters.
+  ///
+  /// @param Parameters The list of parameters in which to store.
+  /// @param Param      The kernel launch parameter to store.
+  /// @param Index      The index in the parameter list, at which to store the
+  ///                   parameter.
+  void insertStoreParameter(Instruction *Parameters, Instruction *Param,
+                            int Index);
+
   /// Create kernel launch parameters.
   ///
   /// @param Kernel The kernel to create parameters for.
@@ -1192,11 +1210,21 @@
   return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
 }
 
+void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters,
+                                          Instruction *Param, int Index) {
+  Value *Slot = Builder.CreateGEP(
+      Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
+  Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
+  Builder.CreateStore(ParamTyped, Slot);
+}
+
 Value *
 GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
                                        SetVector<Value *> SubtreeValues) {
-  Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(),
-                                 std::distance(F->arg_begin(), F->arg_end()));
+  const int NumArgs = F->arg_size();
+  std::vector<int> ArgSizes(NumArgs);
+
+  Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs);
 
   BasicBlock *EntryBlock =
       &Builder.GetInsertBlock()->getParent()->getEntryBlock();
@@ -1213,6 +1241,8 @@
     isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
     const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
 
+    ArgSizes[Index] = SAI->getElemSizeInBytes();
+
     Value *DevArray = nullptr;
     if (ManagedMemory) {
       DevArray = getOrCreateManagedDeviceArray(
@@ -1265,16 +1295,15 @@
     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
     Value *Val = IDToValue[Id];
     isl_id_free(Id);
+
+    ArgSizes[Index] = computeSizeInBytes(Val->getType());
+
     Instruction *Param =
         new AllocaInst(Val->getType(), AddressSpace,
                        Launch + "_param_" + std::to_string(Index),
                        EntryBlock->getTerminator());
     Builder.CreateStore(Val, Param);
-    Value *Slot = Builder.CreateGEP(
-        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
-    Value *ParamTyped =
-        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
-    Builder.CreateStore(ParamTyped, Slot);
+    insertStoreParameter(Parameters, Param, Index);
     Index++;
   }
 
@@ -1284,30 +1313,38 @@
     isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
     Value *Val = IDToValue[Id];
     isl_id_free(Id);
+
+    ArgSizes[Index] = computeSizeInBytes(Val->getType());
+
     Instruction *Param =
         new AllocaInst(Val->getType(), AddressSpace,
                        Launch + "_param_" + std::to_string(Index),
                        EntryBlock->getTerminator());
     Builder.CreateStore(Val, Param);
-    Value *Slot = Builder.CreateGEP(
-        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
-    Value *ParamTyped =
-        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
-    Builder.CreateStore(ParamTyped, Slot);
+    insertStoreParameter(Parameters, Param, Index);
     Index++;
   }
 
   for (auto Val : SubtreeValues) {
+    ArgSizes[Index] = computeSizeInBytes(Val->getType());
+
     Instruction *Param =
         new AllocaInst(Val->getType(), AddressSpace,
                        Launch + "_param_" + std::to_string(Index),
                        EntryBlock->getTerminator());
     Builder.CreateStore(Val, Param);
-    Value *Slot = Builder.CreateGEP(
-        Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
-    Value *ParamTyped =
-        Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
-    Builder.CreateStore(ParamTyped, Slot);
+    insertStoreParameter(Parameters, Param, Index);
+    Index++;
+  }
+
+  for (int i = 0; i < NumArgs; i++) {
+    Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]);
+    Instruction *Param =
+        new AllocaInst(Builder.getInt32Ty(), AddressSpace,
+                       Launch + "_param_size_" + std::to_string(i),
+                       EntryBlock->getTerminator());
+    Builder.CreateStore(Val, Param);
+    insertStoreParameter(Parameters, Param, Index);
     Index++;
   }
 
Index: test/GPGPU/cuda-managed-memory-simple.ll
===================================================================
--- test/GPGPU/cuda-managed-memory-simple.ll
+++ test/GPGPU/cuda-managed-memory-simple.ll
@@ -37,18 +37,26 @@
 
 ; CHECK: %13 = call i8* @polly_initContextCUDA()
 ; CHECK-NEXT: %14 = bitcast i32* %A to i8*
-; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; CHECK-NEXT: %15 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0
 ; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0
 ; CHECK-NEXT: %16 = bitcast i8** %polly_launch_0_param_0 to i8*
 ; CHECK-NEXT: store i8* %16, i8** %15
 ; CHECK-NEXT: %17 = bitcast i32* %R to i8*
-; CHECK-NEXT: %18 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
+; CHECK-NEXT: %18 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1
 ; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1
 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*
 ; CHECK-NEXT: store i8* %19, i8** %18
-; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
-; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
-; CHECK-NEXT: call void @polly_freeKernel(i8* %20)
+; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_0
+; CHECK-NEXT: %20 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 2
+; CHECK-NEXT: %21 = bitcast i32* %polly_launch_0_param_size_0 to i8*
+; CHECK-NEXT: store i8* %21, i8** %20
+; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_1
+; CHECK-NEXT: %22 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 3
+; CHECK-NEXT: %23 = bitcast i32* %polly_launch_0_param_size_1 to i8*
+; CHECK-NEXT: store i8* %23, i8** %22
+; CHECK-NEXT: %24 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
+; CHECK-NEXT: call void @polly_launchKernel(i8* %24, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
+; CHECK-NEXT: call void @polly_freeKernel(i8* %24)
 ; CHECK-NEXT: call void @polly_synchronizeDevice()
 ; CHECK-NEXT: call void @polly_freeContext(i8* %13)
 
Index: test/GPGPU/host-control-flow.ll
===================================================================
--- test/GPGPU/host-control-flow.ll
+++ test/GPGPU/host-control-flow.ll
@@ -32,7 +32,7 @@
 ; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
 ; ...
 ; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1
-; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
+; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1
 ; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
 ; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]
 ; IR: call i8* @polly_getKernel
Index: test/GPGPU/kernel-params-only-some-arrays.ll
===================================================================
--- test/GPGPU/kernel-params-only-some-arrays.ll
+++ test/GPGPU/kernel-params-only-some-arrays.ll
@@ -48,13 +48,13 @@
 
 
 ; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
-; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0
+; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
 ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]
 
 ; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
-; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0
+; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0
 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
 ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]
Index: test/GPGPU/parametric-loop-bound.ll
===================================================================
--- test/GPGPU/parametric-loop-bound.ll
+++ test/GPGPU/parametric-loop-bound.ll
@@ -31,7 +31,7 @@
 ; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0);
 
 ; IR: store i64 %n, i64* %polly_launch_0_param_1
-; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
+; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1
 ; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
 ; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]
 
Index: tools/GPURuntime/GPUJIT.c
===================================================================
--- tools/GPURuntime/GPUJIT.c
+++ tools/GPURuntime/GPUJIT.c
@@ -554,28 +554,12 @@
                           sizeof(cl_uint), &NumArgs, NULL);
   checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");
 
-  // TODO: Pass the size of the kernel arguments in to launchKernelCL, along
-  // with the arguments themselves. This is a dirty workaround that can be
-  // broken.
+  /* Argument sizes are stored at the end of the Parameters array. */
   for (cl_uint i = 0; i < NumArgs; i++) {
-    Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]);
-    if (Ret == CL_INVALID_ARG_SIZE) {
-      Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]);
-      if (Ret == CL_INVALID_ARG_SIZE) {
-        Ret =
-            clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]);
-        if (Ret == CL_INVALID_ARG_SIZE) {
-          Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1,
-                                     (void *)Parameters[i]);
-          checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
-        }
-      }
-    }
-    if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) {
-      fprintf(stderr, "Failed to set Kernel argument.\n");
-      printOpenCLError(Ret);
-      exit(-1);
-    }
+    Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
+                               *((int *)Parameters[NumArgs + i]),
+                               (void *)Parameters[i]);
+    checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
   }
 
   unsigned int GridDimZ = 1;