Index: lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- lib/CodeGen/PPCGCodeGeneration.cpp +++ lib/CodeGen/PPCGCodeGeneration.cpp @@ -1195,8 +1195,10 @@ Value * GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F, SetVector SubtreeValues) { - Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), - std::distance(F->arg_begin(), F->arg_end())); + int NumArgs = std::distance(F->arg_begin(), F->arg_end()); + int *ArgSizes = new int[NumArgs]; + + Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); BasicBlock *EntryBlock = &Builder.GetInsertBlock()->getParent()->getEntryBlock(); @@ -1213,6 +1215,8 @@ isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id); + ArgSizes[Index] = SAI->getElemSizeInBytes(); + Value *DevArray = nullptr; if (ManagedMemory) { DevArray = getOrCreateManagedDeviceArray( @@ -1265,6 +1269,12 @@ isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i); Value *Val = IDToValue[Id]; isl_id_free(Id); + + int SizeInBytes = Val->getType()->getPrimitiveSizeInBits() / 8; + if (SizeInBytes == 0) + SizeInBytes = Val->getType()->getScalarSizeInBits() / 8; + ArgSizes[Index] = SizeInBytes; + Instruction *Param = new AllocaInst(Val->getType(), AddressSpace, Launch + "_param_" + std::to_string(Index), @@ -1284,6 +1294,12 @@ isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i); Value *Val = IDToValue[Id]; isl_id_free(Id); + + int SizeInBytes = Val->getType()->getPrimitiveSizeInBits() / 8; + if (SizeInBytes == 0) + SizeInBytes = Val->getType()->getScalarSizeInBits() / 8; + ArgSizes[Index] = SizeInBytes; + Instruction *Param = new AllocaInst(Val->getType(), AddressSpace, Launch + "_param_" + std::to_string(Index), @@ -1298,6 +1314,11 @@ } for (auto Val : SubtreeValues) { + int SizeInBytes = Val->getType()->getPrimitiveSizeInBits() / 8; + if (SizeInBytes == 0) + SizeInBytes = Val->getType()->getScalarSizeInBits() / 8; + ArgSizes[Index] = SizeInBytes; + Instruction *Param = new AllocaInst(Val->getType(), AddressSpace, Launch + "_param_" + std::to_string(Index), @@ -1311,6 +1332,23 @@ Index++; } + for (int i = 0; i < NumArgs; i++) { + Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); + Instruction *Param = + new AllocaInst(Builder.getInt32Ty(), AddressSpace, + Launch + "_param_size_" + std::to_string(i), + EntryBlock->getTerminator()); + Builder.CreateStore(Val, Param); + Value *Slot = Builder.CreateGEP( + Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); + Value *ParamTyped = + Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); + Builder.CreateStore(ParamTyped, Slot); + Index++; + } + + delete[] ArgSizes; + auto Location = EntryBlock->getTerminator(); return new BitCastInst(Parameters, Builder.getInt8PtrTy(), Launch + "_params_i8ptr", Location); Index: test/GPGPU/cuda-managed-memory-simple.ll =================================================================== --- test/GPGPU/cuda-managed-memory-simple.ll +++ test/GPGPU/cuda-managed-memory-simple.ll @@ -37,18 +37,26 @@ ; CHECK: %13 = call i8* @polly_initContextCUDA() ; CHECK-NEXT: %14 = bitcast i32* %A to i8* -; CHECK-NEXT: %15 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; CHECK-NEXT: %15 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0 ; CHECK-NEXT: %16 = bitcast i8** %polly_launch_0_param_0 to i8* ; CHECK-NEXT: store i8* %16, i8** %15 ; CHECK-NEXT: %17 = bitcast i32* %R to i8* -; CHECK-NEXT: %18 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; CHECK-NEXT: %18 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 ; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8* ; CHECK-NEXT: store i8* %19, i8** %18 -; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) -; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) -; CHECK-NEXT: call void @polly_freeKernel(i8* %20) +; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_0 +; CHECK-NEXT: %20 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 2 +; CHECK-NEXT: %21 = bitcast i32* %polly_launch_0_param_size_0 to i8* +; CHECK-NEXT: store i8* %21, i8** %20 +; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_1 +; CHECK-NEXT: %22 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 3 +; CHECK-NEXT: %23 = bitcast i32* %polly_launch_0_param_size_1 to i8* +; CHECK-NEXT: store i8* %23, i8** %22 +; CHECK-NEXT: %24 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0)) +; CHECK-NEXT: call void @polly_launchKernel(i8* %24, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) +; CHECK-NEXT: call void @polly_freeKernel(i8* %24) ; CHECK-NEXT: call void @polly_synchronizeDevice() ; CHECK-NEXT: call void @polly_freeContext(i8* %13) Index: test/GPGPU/host-control-flow.ll =================================================================== --- test/GPGPU/host-control-flow.ll +++ test/GPGPU/host-control-flow.ll @@ -32,7 +32,7 @@ ; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] ; ... ; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 ; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* ; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] ; IR: call i8* @polly_getKernel Index: test/GPGPU/kernel-params-only-some-arrays.ll =================================================================== --- test/GPGPU/kernel-params-only-some-arrays.ll +++ test/GPGPU/kernel-params-only-some-arrays.ll @@ -48,13 +48,13 @@ ; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] ; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 +; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] Index: test/GPGPU/parametric-loop-bound.ll =================================================================== --- test/GPGPU/parametric-loop-bound.ll +++ test/GPGPU/parametric-loop-bound.ll @@ -31,7 +31,7 @@ ; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0); ; IR: store i64 %n, i64* %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 ; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* ; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] Index: tools/GPURuntime/GPUJIT.c =================================================================== --- tools/GPURuntime/GPUJIT.c +++ tools/GPURuntime/GPUJIT.c @@ -554,28 +554,12 @@ sizeof(cl_uint), &NumArgs, NULL); checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n"); - // TODO: Pass the size of the kernel arguments in to launchKernelCL, along - // with the arguments themselves. This is a dirty workaround that can be - // broken. + /* Argument sizes are stored at the end of the Parameters array. */ for (cl_uint i = 0; i < NumArgs; i++) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = - clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]); - if (Ret == CL_INVALID_ARG_SIZE) { - Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1, - (void *)Parameters[i]); - checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); - } - } - } - if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) { - fprintf(stderr, "Failed to set Kernel argument.\n"); - printOpenCLError(Ret); - exit(-1); - } + Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, + *((int *)Parameters[NumArgs + i]), + (void *)Parameters[i]); + checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i); } unsigned int GridDimZ = 1;