Diff 98186

lib/CodeGen/PPCGCodeGeneration.cpp

Show First 20 Lines • Show All 136 Lines • ▼ Show 20 Lines	for (MemoryAccess Acc : Stmt) {
isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);		isl_ast_expr *Access = isl_ast_build_access_from_multi_pw_aff(Build, MPA);
Access = FunctionExpr(Access, RefId, UserExpr);		Access = FunctionExpr(Access, RefId, UserExpr);
RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);		RefToExpr = isl_id_to_ast_expr_set(RefToExpr, RefId, Access);
}		}

return RefToExpr;		return RefToExpr;
}		}

		/// Given an LLVM Type, compute its size in bytes.
		static int computeSizeInBytes(Type *T) {
		bolluUnsubmitted Done Reply Inline Actions `getPrimitiveSizeInBits` and `getScalarSizeInBits` accept `const Type *T`. Could you please change the `Type *T` to `const Type *T`? bollu: `getPrimitiveSizeInBits` and `getScalarSizeInBits` accept `const Type *T`. Could you please…
		int bytes = T->getPrimitiveSizeInBits() / 8;
		if (bytes == 0)
		bytes = T->getScalarSizeInBits() / 8;
		return bytes;
		}

/// Generate code for a GPU specific isl AST.		/// Generate code for a GPU specific isl AST.
///		///
/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which		/// The GPUNodeBuilder augments the general existing IslNodeBuilder, which
/// generates code for general-purpose AST nodes, with special functionality		/// generates code for general-purpose AST nodes, with special functionality
/// for generating GPU specific user nodes.		/// for generating GPU specific user nodes.
///		///
/// @see GPUNodeBuilder::createUser		/// @see GPUNodeBuilder::createUser
class GPUNodeBuilder : public IslNodeBuilder {		class GPUNodeBuilder : public IslNodeBuilder {
▲ Show 20 Lines • Show All 114 Lines • ▼ Show 20 Lines	private:

/// Compute the sizes of the thread blocks for a given kernel.		/// Compute the sizes of the thread blocks for a given kernel.
///		///
/// @param Kernel The kernel to compute thread block sizes for.		/// @param Kernel The kernel to compute thread block sizes for.
///		///
/// @returns A tuple with thread block sizes for X, Y, and Z dimensions.		/// @returns A tuple with thread block sizes for X, Y, and Z dimensions.
std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);		std::tuple<Value *, Value *, Value *> getBlockSizes(ppcg_kernel *Kernel);

		/// Store a specific kernel launch parameter in the array of kernel launch
		/// parameters.
		///
		/// @param Parameters The list of parameters in which to store.
		/// @param Param The kernel launch parameter to store.
		/// @param Index The index in the parameter list, at which to store the
		/// parameter.
		void insertStoreParameter(Instruction *Parameters, Instruction *Param,
		int Index);

/// Create kernel launch parameters.		/// Create kernel launch parameters.
///		///
/// @param Kernel The kernel to create parameters for.		/// @param Kernel The kernel to create parameters for.
/// @param F The kernel function that has been created.		/// @param F The kernel function that has been created.
/// @param SubtreeValues The set of llvm::Values referenced by this kernel.		/// @param SubtreeValues The set of llvm::Values referenced by this kernel.
///		///
/// @returns A stack allocated array with pointers to the parameter		/// @returns A stack allocated array with pointers to the parameter
/// values that are passed to the kernel.		/// values that are passed to the kernel.
▲ Show 20 Lines • Show All 904 Lines • ▼ Show 20 Lines	GPUNodeBuilder::getBlockSizes(ppcg_kernel *Kernel) {
}		}

for (long i = Kernel->n_block; i < 3; i++)		for (long i = Kernel->n_block; i < 3; i++)
Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));		Sizes.push_back(ConstantInt::get(Builder.getInt32Ty(), 1));

return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);		return std::make_tuple(Sizes[0], Sizes[1], Sizes[2]);
}		}

		void GPUNodeBuilder::insertStoreParameter(Instruction *Parameters,
		Instruction *Param, int Index) {
		Value *Slot = Builder.CreateGEP(
		Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
		Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
		Builder.CreateStore(ParamTyped, Slot);
		}

Value *		Value *
GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,		GPUNodeBuilder::createLaunchParameters(ppcg_kernel *Kernel, Function *F,
SetVector<Value *> SubtreeValues) {		SetVector<Value *> SubtreeValues) {
Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(),		const int NumArgs = F->arg_size();
std::distance(F->arg_begin(), F->arg_end()));		std::vector<int> ArgSizes = std::vector<int>(NumArgs);
		bolluUnsubmitted Done Reply Inline Actions why not `std::vector<int>(NumArgs)`? That removes the explicit memory allocation/deallocation. I wish `std::dynarray` existed, since that's exactly the use case for this. bollu: why not `std::vector<int>(NumArgs)`? That removes the explicit memory allocation/deallocation.
		bolluUnsubmitted Done Reply Inline Actions Have you considered `std::vector<int> ArgSizes(NumArgs);`? I find this more idiomatic. bollu: Have you considered `std::vector<int> ArgSizes(NumArgs);`? I find this more idiomatic.

		Type *ArrayTy = ArrayType::get(Builder.getInt8PtrTy(), 2 * NumArgs);
		bolluUnsubmitted Done Reply Inline Actions I believe `int NumArgs = F->getArgumentList().size()` will also work? seems a little clearer to me. bollu: I believe `int NumArgs = F->getArgumentList().size()` will also work? seems a little clearer to…
		PhilippSchaadAuthorUnsubmitted Done Reply Inline Actions The type llvm::Function does not seem to have a member getArgumentList()? Also, I have just adapted that from before, but I agree that something like that would be clearer. PhilippSchaad: The type llvm::Function does not seem to have a member getArgumentList()? Also, I have just…
		bolluUnsubmitted Done Reply Inline Actions Hm, it looks like it has been removed. [I was looking at an older reference to `Function::getArgumentList`](http://llvm.org/docs/doxygen/html/classllvm_1_1Function.html#a0a46edcf9b885556850d8ed9c49d9b52). Apologies! I believe that [`llvm::Function::arg_size()`](http://llvm.org/doxygen/classllvm_1_1Function.html#abccf59dbcc12707d079124e6bcfb4a47) should work for this purpose. bollu: Hm, it looks like it has been removed. [I was looking at an older reference to `Function…
		bolluUnsubmitted Done Reply Inline Actions `NumArgs` could be `const`? It doesn't seem to be mutated anywhere. bollu: `NumArgs` could be `const`? It doesn't seem to be mutated anywhere.

BasicBlock *EntryBlock =		BasicBlock *EntryBlock =
&Builder.GetInsertBlock()->getParent()->getEntryBlock();		&Builder.GetInsertBlock()->getParent()->getEntryBlock();
auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace();		auto AddressSpace = F->getParent()->getDataLayout().getAllocaAddrSpace();
std::string Launch = "polly_launch_" + std::to_string(Kernel->id);		std::string Launch = "polly_launch_" + std::to_string(Kernel->id);
Instruction *Parameters = new AllocaInst(		Instruction *Parameters = new AllocaInst(
ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator());		ArrayTy, AddressSpace, Launch + "_params", EntryBlock->getTerminator());

int Index = 0;		int Index = 0;
for (long i = 0; i < Prog->n_array; i++) {		for (long i = 0; i < Prog->n_array; i++) {
if (!ppcg_kernel_requires_array_argument(Kernel, i))		if (!ppcg_kernel_requires_array_argument(Kernel, i))
continue;		continue;

isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);		isl_id *Id = isl_space_get_tuple_id(Prog->array[i].space, isl_dim_set);
const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);		const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);

		ArgSizes[Index] = SAI->getElemSizeInBytes();

Value *DevArray = nullptr;		Value *DevArray = nullptr;
if (ManagedMemory) {		if (ManagedMemory) {
DevArray = getOrCreateManagedDeviceArray(		DevArray = getOrCreateManagedDeviceArray(
&Prog->array[i], const_cast<ScopArrayInfo *>(SAI));		&Prog->array[i], const_cast<ScopArrayInfo *>(SAI));
} else {		} else {
DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];		DevArray = DeviceAllocations[const_cast<ScopArrayInfo *>(SAI)];
DevArray = createCallGetDevicePtr(DevArray);		DevArray = createCallGetDevicePtr(DevArray);
}		}
Show All 36 Lines	GPUNodeBuilder::createLaunchParameters(ppcg_kernel Kernel, Function F,
}		}

int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);		int NumHostIters = isl_space_dim(Kernel->space, isl_dim_set);

for (long i = 0; i < NumHostIters; i++) {		for (long i = 0; i < NumHostIters; i++) {
isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);		isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_set, i);
Value *Val = IDToValue[Id];		Value *Val = IDToValue[Id];
isl_id_free(Id);		isl_id_free(Id);

		ArgSizes[Index] = computeSizeInBytes(Val->getType());

Instruction *Param =		Instruction *Param =
new AllocaInst(Val->getType(), AddressSpace,		new AllocaInst(Val->getType(), AddressSpace,
		bolluUnsubmitted Done Reply Inline Actions The computation of `SizeInBytes` seems to be a pure computation which is repeated thrice in this function (`GPUNodeBuilder::createLaunchParameters`). Could this be refactored into a standalone function? That would also make the assignment look a little cleaner: ArgSizes[Index] = computeSizeInBytes(Val->getType()); bollu: The computation of `SizeInBytes` seems to be a pure computation which is repeated thrice in…
Launch + "_param_" + std::to_string(Index),		Launch + "_param_" + std::to_string(Index),
EntryBlock->getTerminator());		EntryBlock->getTerminator());
Builder.CreateStore(Val, Param);		Builder.CreateStore(Val, Param);
Value *Slot = Builder.CreateGEP(		insertStoreParameter(Parameters, Param, Index);
Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
Value *ParamTyped =
Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
Builder.CreateStore(ParamTyped, Slot);
Index++;		Index++;
}		}

int NumVars = isl_space_dim(Kernel->space, isl_dim_param);		int NumVars = isl_space_dim(Kernel->space, isl_dim_param);

for (long i = 0; i < NumVars; i++) {		for (long i = 0; i < NumVars; i++) {
isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);		isl_id *Id = isl_space_get_dim_id(Kernel->space, isl_dim_param, i);
Value *Val = IDToValue[Id];		Value *Val = IDToValue[Id];
isl_id_free(Id);		isl_id_free(Id);

		ArgSizes[Index] = computeSizeInBytes(Val->getType());

Instruction *Param =		Instruction *Param =
new AllocaInst(Val->getType(), AddressSpace,		new AllocaInst(Val->getType(), AddressSpace,
Launch + "_param_" + std::to_string(Index),		Launch + "_param_" + std::to_string(Index),
EntryBlock->getTerminator());		EntryBlock->getTerminator());
Builder.CreateStore(Val, Param);		Builder.CreateStore(Val, Param);
Value *Slot = Builder.CreateGEP(		insertStoreParameter(Parameters, Param, Index);
Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});
Value *ParamTyped =
Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
Builder.CreateStore(ParamTyped, Slot);
Index++;		Index++;
}		}

for (auto Val : SubtreeValues) {		for (auto Val : SubtreeValues) {
		ArgSizes[Index] = computeSizeInBytes(Val->getType());

Instruction *Param =		Instruction *Param =
new AllocaInst(Val->getType(), AddressSpace,		new AllocaInst(Val->getType(), AddressSpace,
Launch + "_param_" + std::to_string(Index),		Launch + "_param_" + std::to_string(Index),
EntryBlock->getTerminator());		EntryBlock->getTerminator());
Builder.CreateStore(Val, Param);		Builder.CreateStore(Val, Param);
Value *Slot = Builder.CreateGEP(		insertStoreParameter(Parameters, Param, Index);
Parameters, {Builder.getInt64(0), Builder.getInt64(Index)});		Index++;
Value *ParamTyped =		}
Builder.CreatePointerCast(Param, Builder.getInt8PtrTy());
Builder.CreateStore(ParamTyped, Slot);		for (int i = 0; i < NumArgs; i++) {
		Value *Val = ConstantInt::get(Builder.getInt32Ty(), ArgSizes[i]);
		Instruction *Param =
		new AllocaInst(Builder.getInt32Ty(), AddressSpace,
		Launch + "_param_size_" + std::to_string(i),
		EntryBlock->getTerminator());
		Builder.CreateStore(Val, Param);
		insertStoreParameter(Parameters, Param, Index);
Index++;		Index++;
}		}

auto Location = EntryBlock->getTerminator();		auto Location = EntryBlock->getTerminator();
return new BitCastInst(Parameters, Builder.getInt8PtrTy(),		return new BitCastInst(Parameters, Builder.getInt8PtrTy(),
Launch + "_params_i8ptr", Location);		Launch + "_params_i8ptr", Location);
}		}

void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {		void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
isl_id *Id = isl_ast_node_get_annotation(KernelStmt);		isl_id *Id = isl_ast_node_get_annotation(KernelStmt);
ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);		ppcg_kernel *Kernel = (ppcg_kernel *)isl_id_get_user(Id);
isl_id_free(Id);		isl_id_free(Id);
isl_ast_node_free(KernelStmt);		isl_ast_node_free(KernelStmt);

if (Kernel->n_grid > 1)		if (Kernel->n_grid > 1)
DeepestParallel =		DeepestParallel =
		bolluUnsubmitted Done Reply Inline Actions Could the pattern of to-split.cpp Value *Slot = Builder.CreateGEP( Parameters, {Builder.getInt64(0), Builder.getInt64(Index)}); Value *ParamTyped = Builder.CreatePointerCast(Param, Builder.getInt8PtrTy()); Builder.CreateStore(ParamTyped, Slot); be refactored into a separate function if it is not too much trouble? It occurs thrice in this function (`GPUNodeBuilder::createLaunchParameters`) bollu: Could the pattern of ```lang=cpp, name=to-split.cpp Value *Slot = Builder.CreateGEP…
		PhilippSchaadAuthorUnsubmitted Done Reply Inline Actions Looking into it. PhilippSchaad: Looking into it.
std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set));		std::max(DeepestParallel, isl_space_dim(Kernel->space, isl_dim_set));
else		else
DeepestSequential =		DeepestSequential =
std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set));		std::max(DeepestSequential, isl_space_dim(Kernel->space, isl_dim_set));

Value *BlockDimX, *BlockDimY, *BlockDimZ;		Value *BlockDimX, *BlockDimY, *BlockDimZ;
std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);		std::tie(BlockDimX, BlockDimY, BlockDimZ) = getBlockSizes(Kernel);

▲ Show 20 Lines • Show All 1,365 Lines • Show Last 20 Lines

test/GPGPU/cuda-managed-memory-simple.ll

	Show All 31 Lines

	; CHECK-NOT: polly_copyFromHostToDevice			; CHECK-NOT: polly_copyFromHostToDevice
	; CHECK-NOT: polly_copyFromDeviceToHost			; CHECK-NOT: polly_copyFromDeviceToHost
	; CHECK-NOT: polly_freeDeviceMemory			; CHECK-NOT: polly_freeDeviceMemory
	; CHECK-NOT: polly_allocateMemoryForDevice			; CHECK-NOT: polly_allocateMemoryForDevice

	; CHECK: %13 = call i8* @polly_initContextCUDA()			; CHECK: %13 = call i8* @polly_initContextCUDA()
	; CHECK-NEXT: %14 = bitcast i32* %A to i8*			; CHECK-NEXT: %14 = bitcast i32* %A to i8*
	; CHECK-NEXT: %15 = getelementptr [2 x i8], [2 x i8]* %polly_launch_0_params, i64 0, i64 0			; CHECK-NEXT: %15 = getelementptr [4 x i8], [4 x i8]* %polly_launch_0_params, i64 0, i64 0
	; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0			; CHECK-NEXT: store i8* %14, i8** %polly_launch_0_param_0
	; CHECK-NEXT: %16 = bitcast i8** %polly_launch_0_param_0 to i8*			; CHECK-NEXT: %16 = bitcast i8** %polly_launch_0_param_0 to i8*
	; CHECK-NEXT: store i8* %16, i8** %15			; CHECK-NEXT: store i8* %16, i8** %15
	; CHECK-NEXT: %17 = bitcast i32* %R to i8*			; CHECK-NEXT: %17 = bitcast i32* %R to i8*
	; CHECK-NEXT: %18 = getelementptr [2 x i8], [2 x i8]* %polly_launch_0_params, i64 0, i64 1			; CHECK-NEXT: %18 = getelementptr [4 x i8], [4 x i8]* %polly_launch_0_params, i64 0, i64 1
	; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1			; CHECK-NEXT: store i8* %17, i8** %polly_launch_0_param_1
	; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*			; CHECK-NEXT: %19 = bitcast i8** %polly_launch_0_param_1 to i8*
	; CHECK-NEXT: store i8* %19, i8** %18			; CHECK-NEXT: store i8* %19, i8** %18
	; CHECK-NEXT: %20 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))			; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_0
	; CHECK-NEXT: call void @polly_launchKernel(i8* %20, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)			; CHECK-NEXT: %20 = getelementptr [4 x i8], [4 x i8]* %polly_launch_0_params, i64 0, i64 2
	; CHECK-NEXT: call void @polly_freeKernel(i8* %20)			; CHECK-NEXT: %21 = bitcast i32* %polly_launch_0_param_size_0 to i8*
				; CHECK-NEXT: store i8* %21, i8** %20
				; CHECK-NEXT: store i32 4, i32* %polly_launch_0_param_size_1
				; CHECK-NEXT: %22 = getelementptr [4 x i8], [4 x i8]* %polly_launch_0_params, i64 0, i64 3
				; CHECK-NEXT: %23 = bitcast i32* %polly_launch_0_param_size_1 to i8*
				; CHECK-NEXT: store i8* %23, i8** %22
				; CHECK-NEXT: %24 = call i8* @polly_getKernel(i8* getelementptr inbounds ([750 x i8], [750 x i8]* @kernel_0, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @kernel_0_name, i32 0, i32 0))
				; CHECK-NEXT: call void @polly_launchKernel(i8* %24, i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
				; CHECK-NEXT: call void @polly_freeKernel(i8* %24)
	; CHECK-NEXT: call void @polly_synchronizeDevice()			; CHECK-NEXT: call void @polly_synchronizeDevice()
	; CHECK-NEXT: call void @polly_freeContext(i8* %13)			; CHECK-NEXT: call void @polly_freeContext(i8* %13)

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

	define void @copy(i32* %R, i32* %A) {			define void @copy(i32* %R, i32* %A) {
	entry:			entry:
	br label %for.cond			br label %for.cond
	▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines

test/GPGPU/host-control-flow.ll

	Show All 26 Lines

	; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));			; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
	; CODE-NEXT: }			; CODE-NEXT: }

	; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader			; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
	; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]			; IR-NEXT: %polly.indvar = phi i64 [ 0, %polly.loop_preheader ], [ %polly.indvar_next, %polly.loop_header ]
	; ...			; ...
	; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1			; IR: store i64 %polly.indvar, i64* %polly_launch_0_param_1
	; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8], [2 x i8]* %polly_launch_0_params, i64 0, i64 1			; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8], [4 x i8]* %polly_launch_0_params, i64 0, i64 1
	; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*			; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
	; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]			; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]
	; IR: call i8* @polly_getKernel			; IR: call i8* @polly_getKernel
	; ...			; ...
	; IR: call void @polly_freeKernel			; IR: call void @polly_freeKernel
	; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1			; IR-NEXT: %polly.indvar_next = add nsw i64 %polly.indvar, 1
	; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98			; IR-NEXT: %polly.loop_cond = icmp sle i64 %polly.indvar, 98
	; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit			; IR-NEXT: br i1 %polly.loop_cond, label %polly.loop_header, label %polly.loop_exit
	▲ Show 20 Lines • Show All 134 Lines • Show Last 20 Lines

test/GPGPU/kernel-params-only-some-arrays.ll

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()			; KERNEL-NEXT: %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
	; KERNEL-NEXT: %t0 = zext i32 %1 to i64			; KERNEL-NEXT: %t0 = zext i32 %1 to i64

	; KERNEL: ret void			; KERNEL: ret void
	; KERNEL-NEXT: }			; KERNEL-NEXT: }


	; IR: [[DEVPTR:%.]] = call i8 @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)			; IR: [[DEVPTR:%.]] = call i8 @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
	; IR-NEXT: [[SLOT:%.]] = getelementptr [1 x i8], [1 x i8] %polly_launch_0_params, i64 0, i64 0			; IR-NEXT: [[SLOT:%.]] = getelementptr [2 x i8], [2 x i8] %polly_launch_0_params, i64 0, i64 0
	; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0			; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
	; IR-NEXT: [[DATA:%.]] = bitcast i8* %polly_launch_0_param_0 to i8*			; IR-NEXT: [[DATA:%.]] = bitcast i8* %polly_launch_0_param_0 to i8*
	; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]			; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]

	; IR: [[DEVPTR:%.]] = call i8 @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)			; IR: [[DEVPTR:%.]] = call i8 @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
	; IR-NEXT: [[SLOT:%.]] = getelementptr [1 x i8], [1 x i8] %polly_launch_1_params, i64 0, i64 0			; IR-NEXT: [[SLOT:%.]] = getelementptr [2 x i8], [2 x i8] %polly_launch_1_params, i64 0, i64 0
	; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0			; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
	; IR-NEXT: [[DATA:%.]] = bitcast i8* %polly_launch_1_param_0 to i8*			; IR-NEXT: [[DATA:%.]] = bitcast i8* %polly_launch_1_param_0 to i8*
	; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]			; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

	define void @kernel_params_only_some_arrays(float* %A, float* %B) {			define void @kernel_params_only_some_arrays(float* %A, float* %B) {
	entry:			entry:
	Show All 40 Lines

test/GPGPU/parametric-loop-bound.ll

	Show All 25 Lines
	; CODE-NEXT: }			; CODE-NEXT: }

	; CODE: # kernel0			; CODE: # kernel0
	; CODE-NEXT: for (int c0 = 0; c0 <= (n - 32 * b0 - 1) / 1048576; c0 += 1)			; CODE-NEXT: for (int c0 = 0; c0 <= (n - 32 * b0 - 1) / 1048576; c0 += 1)
	; CODE-NEXT: if (n >= 32 * b0 + t0 + 1048576 * c0 + 1)			; CODE-NEXT: if (n >= 32 * b0 + t0 + 1048576 * c0 + 1)
	; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0);			; CODE-NEXT: Stmt_bb2(32 * b0 + t0 + 1048576 * c0);

	; IR: store i64 %n, i64* %polly_launch_0_param_1			; IR: store i64 %n, i64* %polly_launch_0_param_1
	; IR-NEXT: [[REGA:%.+]] = getelementptr [2 x i8], [2 x i8]* %polly_launch_0_params, i64 0, i64 1			; IR-NEXT: [[REGA:%.+]] = getelementptr [4 x i8], [4 x i8]* %polly_launch_0_params, i64 0, i64 1
	; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*			; IR-NEXT: [[REGB:%.+]] = bitcast i64* %polly_launch_0_param_1 to i8*
	; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]			; IR-NEXT: store i8* [[REGB]], i8** [[REGA]]

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

	define void @foo(i64* %A, i64 %n) {			define void @foo(i64* %A, i64 %n) {
	bb:			bb:
	br label %bb1			br label %bb1
	Show All 20 Lines

tools/GPURuntime/GPUJIT.c

Show First 20 Lines • Show All 548 Lines • ▼ Show 20 Lines	if (!GlobalContext) {
exit(-1);		exit(-1);
}		}

OpenCLKernel CLKernel = (OpenCLKernel )Kernel->Kernel;		OpenCLKernel CLKernel = (OpenCLKernel )Kernel->Kernel;
Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,		Ret = clGetKernelInfoFcnPtr(CLKernel->Kernel, CL_KERNEL_NUM_ARGS,
sizeof(cl_uint), &NumArgs, NULL);		sizeof(cl_uint), &NumArgs, NULL);
checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");		checkOpenCLError(Ret, "Failed to get number of kernel arguments.\n");

// TODO: Pass the size of the kernel arguments in to launchKernelCL, along		/* Argument sizes are stored at the end of the Parameters array. */
// with the arguments themselves. This is a dirty workaround that can be
// broken.
for (cl_uint i = 0; i < NumArgs; i++) {		for (cl_uint i = 0; i < NumArgs; i++) {
Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 8, (void *)Parameters[i]);		Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i,
if (Ret == CL_INVALID_ARG_SIZE) {		*((int *)Parameters[NumArgs + i]),
Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 4, (void *)Parameters[i]);
if (Ret == CL_INVALID_ARG_SIZE) {
Ret =
clSetKernelArgFcnPtr(CLKernel->Kernel, i, 2, (void *)Parameters[i]);
if (Ret == CL_INVALID_ARG_SIZE) {
Ret = clSetKernelArgFcnPtr(CLKernel->Kernel, i, 1,
(void *)Parameters[i]);		(void *)Parameters[i]);
checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);		checkOpenCLError(Ret, "Failed to set Kernel argument %d.\n", i);
}		}
}
}
if (Ret != CL_SUCCESS && Ret != CL_INVALID_ARG_SIZE) {
fprintf(stderr, "Failed to set Kernel argument.\n");
printOpenCLError(Ret);
exit(-1);
}
}

unsigned int GridDimZ = 1;		unsigned int GridDimZ = 1;
size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,		size_t GlobalWorkSize[3] = {BlockDimX * GridDimX, BlockDimY * GridDimY,
BlockDimZ * GridDimZ};		BlockDimZ * GridDimZ};
size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};		size_t LocalWorkSize[3] = {BlockDimX, BlockDimY, BlockDimZ};

static const int WorkDim = 3;		static const int WorkDim = 3;
OpenCLContext CLContext = (OpenCLContext )GlobalContext->Context;		OpenCLContext CLContext = (OpenCLContext )GlobalContext->Context;
▲ Show 20 Lines • Show All 1,063 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[Polly][PPCGCodeGen] OpenCL now gets kernel argument size from PPCG CodeGen
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 98186

lib/CodeGen/PPCGCodeGeneration.cpp

test/GPGPU/cuda-managed-memory-simple.ll

test/GPGPU/host-control-flow.ll

test/GPGPU/kernel-params-only-some-arrays.ll

test/GPGPU/parametric-loop-bound.ll

tools/GPURuntime/GPUJIT.c

This is an archive of the discontinued LLVM Phabricator instance.

[Polly][PPCGCodeGen] OpenCL now gets kernel argument size from PPCG CodeGenClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 98186

lib/CodeGen/PPCGCodeGeneration.cpp

test/GPGPU/cuda-managed-memory-simple.ll

test/GPGPU/host-control-flow.ll

test/GPGPU/kernel-params-only-some-arrays.ll

test/GPGPU/parametric-loop-bound.ll

tools/GPURuntime/GPUJIT.c

[Polly][PPCGCodeGen] OpenCL now gets kernel argument size from PPCG CodeGen
ClosedPublic