Index: polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp =================================================================== --- polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp +++ polly/trunk/lib/CodeGen/PPCGCodeGeneration.cpp @@ -1591,7 +1591,15 @@ const int NumArgs = F->arg_size(); std::vector ArgSizes(NumArgs); - Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); + // If we are using the OpenCL Runtime, we need to add the kernel argument + // sizes to the end of the launch-parameter list, so OpenCL can determine + // how big the respective kernel arguments are. + // Here we need to reserve adequate space for that. + Type *ArrayTy; + if (Runtime == GPURuntime::OpenCL) + ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs); + else + ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumArgs); BasicBlock *EntryBlock = &Builder.GetInsertBlock()->getParent()->getEntryBlock(); @@ -1608,7 +1616,8 @@ isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set); const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(isl::manage(Id)); - ArgSizes[Index] = SAI->getElemSizeInBytes(); + if (Runtime == GPURuntime::OpenCL) + ArgSizes[Index] = SAI->getElemSizeInBytes(); Value *DevArray = nullptr; if (PollyManagedMemory) { @@ -1663,7 +1672,8 @@ Value *Val = IDToValue[Id]; isl_id_free(Id); - ArgSizes[Index] = computeSizeInBytes(Val->getType()); + if (Runtime == GPURuntime::OpenCL) + ArgSizes[Index] = computeSizeInBytes(Val->getType()); Instruction *Param = new AllocaInst(Val->getType(), AddressSpace, @@ -1683,7 +1693,8 @@ Val = ValueMap[Val]; isl_id_free(Id); - ArgSizes[Index] = computeSizeInBytes(Val->getType()); + if (Runtime == GPURuntime::OpenCL) + ArgSizes[Index] = computeSizeInBytes(Val->getType()); Instruction *Param = new AllocaInst(Val->getType(), AddressSpace, @@ -1695,7 +1706,8 @@ } for (auto Val : SubtreeValues) { - ArgSizes[Index] = computeSizeInBytes(Val->getType()); + if (Runtime == GPURuntime::OpenCL) + ArgSizes[Index] = computeSizeInBytes(Val->getType()); Instruction *Param = new AllocaInst(Val->getType(), AddressSpace, @@ -1706,15 +1718,17 @@ Index++; } - for (int i = 0; i < NumArgs; i++) { - Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); - Instruction *Param = - new AllocaInst(Builder.getInt32Ty(), AddressSpace, - Launch + "_param_size_" + std::to_string(i), - EntryBlock->getTerminator()); - Builder.CreateStore(Val, Param); - insertStoreParameter(Parameters, Param, Index); - Index++; + if (Runtime == GPURuntime::OpenCL) { + for (int i = 0; i < NumArgs; i++) { + Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]); + Instruction *Param = + new AllocaInst(Builder.getInt32Ty(), AddressSpace, + Launch + "_param_size_" + std::to_string(i), + EntryBlock->getTerminator()); + Builder.CreateStore(Val, Param); + insertStoreParameter(Parameters, Param, Index); + Index++; + } } auto Location = EntryBlock->getTerminator(); Index: polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll =================================================================== --- polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll +++ polly/trunk/test/GPGPU/cuda-managed-memory-simple.ll @@ -38,25 +38,17 @@ ; CHECK: %13 = call i8* @polly_initContextCUDA() ; CHECK-NEXT: %14 = bitcast i32* %A to i8* ; CHECK-NEXT: %15 = bitcast i32* %R to i8* -; CHECK-NEXT: %16 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; CHECK-NEXT: %16 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0 ; CHECK-NEXT: %17 = bitcast i8** %polly_launch_0_param_0 to i8* ; CHECK-NEXT: store i8* %17, i8** %16 -; CHECK-NEXT: %18 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; CHECK-NEXT: %18 = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 ; CHECK-NEXT: store i8* %15, i8** %polly_launch_0_param_1 ; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8* ; CHECK-NEXT: store i8* %19, i8** %18 -; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_0 -; CHECK-NEXT: %20 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 2 -; CHECK-NEXT: %21 = bitcast i32* %polly_launch_0_param_size_0 to i8* -; CHECK-NEXT: store i8* %21, i8** %20 -; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_1 -; CHECK-NEXT: %22 = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 3 -; CHECK-NEXT: %23 = bitcast i32* %polly_launch_0_param_size_1 to i8* -; CHECK-NEXT: store i8* %23, i8** %22 -; CHECK-NEXT: %24 = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0)) -; CHECK-NEXT: call void @polly_launchKernel(i8* %24, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) -; CHECK-NEXT: call void @polly_freeKernel(i8* %24) +; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0)) +; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) +; CHECK-NEXT: call void @polly_freeKernel(i8* %20) ; CHECK-NEXT: call void @polly_synchronizeDevice() ; CHECK-NEXT: call void @polly_freeContext(i8* %13) Index: polly/trunk/test/GPGPU/host-control-flow.ll =================================================================== --- polly/trunk/test/GPGPU/host-control-flow.ll +++ polly/trunk/test/GPGPU/host-control-flow.ll @@ -31,7 +31,7 @@ ; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ] ; ... ; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 ; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* ; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] ; IR: call i8* @polly_getKernel Index: polly/trunk/test/GPGPU/invariant-load-hoisting.ll =================================================================== --- polly/trunk/test/GPGPU/invariant-load-hoisting.ll +++ polly/trunk/test/GPGPU/invariant-load-hoisting.ll @@ -17,8 +17,8 @@ ; SCOP-NEXT: [n, tmp12] -> { Stmt_for_body6[i0, i1, i2] -> MemRef_invariant[0] }; ; SCOP-NEXT: Execution Context: [n, tmp12] -> { : n > 0 } ; SCOP-NEXT: } -; HOST-IR: call void @polly_launchKernel(i8* %219, i32 %225, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) -; HOST-IR-NEXT: call void @polly_freeKernel(i8* %219) +; HOST-IR: call void @polly_launchKernel(i8* %209, i32 %215, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr) +; HOST-IR-NEXT: call void @polly_freeKernel(i8* %209) ; KERNEL-IR: define ptx_kernel void @FUNC_f_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B, i8 addrspace(1)* %MemRef_A, i32 %n, i32 %tmp12, i32 %polly.preload.tmp21.merge) Index: polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll =================================================================== --- polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll +++ polly/trunk/test/GPGPU/kernel-params-only-some-arrays.ll @@ -48,13 +48,13 @@ ; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0 +; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_0_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] ; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) -; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0 +; IR-NEXT: [[SLOT:%.*]] = getelementptr [1 x i8*], [1 x i8*]* %polly_launch_1_params, i64 0, i64 0 ; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0 ; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8* ; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]] Index: polly/trunk/test/GPGPU/parametric-loop-bound.ll =================================================================== --- polly/trunk/test/GPGPU/parametric-loop-bound.ll +++ polly/trunk/test/GPGPU/parametric-loop-bound.ll @@ -32,7 +32,7 @@ ; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0); ; IR: store i64 %n, i64* %polly_launch_0_param_1 -; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8*], [4 x i8*]* %polly_launch_0_params, i64 0, i64 1 +; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1 ; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8* ; IR-NEXT: store i8* [[REGB]], i8** [[REGA]] Index: polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll =================================================================== --- polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll +++ polly/trunk/test/GPGPU/phi-nodes-in-kernel.ll @@ -48,7 +48,7 @@ ; CODE-NEXT: if (32 * b0 + t0 <= 48) ; CODE-NEXT: Stmt_for_body17(0, 32 * b0 + t0); -; IR: [[REGC:%.+]] = bitcast i32* %27 to i8* +; IR: [[REGC:%.+]] = bitcast i32* %23 to i8* ; IR-NEXT: call void @polly_copyFromDeviceToHost(i8* %p_dev_array_MemRef_c, i8* [[REGC]], i64 196) ; KERNEL-IR: define ptx_kernel void @FUNC_kernel_dynprog_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_c, i32) #0 {